# In [2]:  (notebook cell marker — kept as a comment so the file parses)
#!/usr/bin/python

import sys
from time import time
import math
import pickle
import pandas as pd
import seaborn as sns
import numpy as np
from pandas import DataFrame
from numpy.lib.function_base import average
from collections import OrderedDict
from matplotlib import pyplot as plt
from sklearn.preprocessing import RobustScaler, MinMaxScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from backports import tempfile
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import make_scorer
from tester import dump_classifier_and_data
sys.path.append('../tools/')


def load_data(file_path):
    '''
    Retrieve the dataset stored in the specific pickle file path.

    Args:
        file_path : string
            The absolute file path for the pickle file containing the data.

    Returns:
        dataset : Dictionary
            Dictionary containing the data stored in the file, in a structured
            format.
    '''
    # Pickle files are binary: open with 'rb' (text mode fails outright
    # under Python 3 and can corrupt reads on Windows). The context
    # manager guarantees the handle is closed even if unpickling raises.
    with open(file_path, 'rb') as pickle_file:
        dataset = pickle.load(pickle_file)

    return dataset


def get_clean_enron_dataframe(enron_data):
    '''
    Performs cleaning operations on the enron_data_frame.

    Args:
        enron_data : Dictionary
            Dictionary containing the data stored in the file, in a structured
            format.

    Returns:
        enron_data_frame : DataFrame
            DataFrame containing the data stored in the file, in a structured
            pandas format, after cleaning the data.
    '''
    pd.options.display.float_format = '{:20,.2f}'.format
    enron_data_frame = DataFrame.from_dict(enron_data, orient='index')
    # Drop the spreadsheet aggregation row and the non-numeric email column.
    enron_data_frame.drop('TOTAL', axis=0, inplace=True)
    enron_data_frame.drop('email_address', axis=1, inplace=True)
    # All 'NaN' strings are converted to real NaN values, which allows the
    # describe function to produce proper numeric values for all statistics.
    # np.nan is used instead of np.NaN: the upper-case alias was removed in
    # NumPy 2.0.
    enron_data_frame.replace('NaN', np.nan, regex=True, inplace=True)
    # Convert True to 1 and False to 0.
    enron_data_frame.replace({True: 1, False: 0}, inplace=True)

    return enron_data_frame


def print_missing_values_table(data_frame):
    '''
    Summarize, per column, how many values are missing and what percentage of
    the rows that represents, keeping only the columns that have gaps.

    Adapted from:
    https://www.kaggle.com/willkoehrsen/start-here-a-gentle-introduction

    Args:
        data_frame : DataFrame
            DataFrame we want to inspect for columns with missing values.

    Returns:
        missing_values_table : DataFrame
            DataFrame containing the missing values statistics for the
            data_frame columns.
    '''
    na_counts = data_frame.isna().sum()
    na_percentages = 100 * na_counts / len(data_frame)
    missing_values_table = pd.concat([na_counts, na_percentages], axis=1)
    missing_values_table.columns = ['Missing Values', '% of Total Values']
    # Keep only columns that actually have missing values, sorted with the
    # worst offenders first.
    has_missing = missing_values_table['% of Total Values'] != 0
    missing_values_table = missing_values_table[has_missing]
    missing_values_table = missing_values_table.sort_values(
        '% of Total Values', ascending=False).round(1)
    # Print a short summary before showing the full table.
    print('\nColumns in dataframe: {}.'.format(data_frame.shape[1]))
    print('Columns missing values: {}.'.format(missing_values_table.shape[0]))
    print('\nMissing values table:')
    display(missing_values_table)

    return missing_values_table


def print_target_correlation_report(correlations_table, label_column_name):
    '''
    Generate a report for the most positive and most negative feature
    correlations with the target feature.

    Args:
        correlations_table : DataFrame
            DataFrame containing the correlations between all data features.
        label_column_name : string
            The name of the column containing the labels for each data point
            in the DataFrame.

    Returns:
        None
    '''
    target_correlations = correlations_table[label_column_name]
    absolute_target_correlations = abs(target_correlations)
    target_correlations_table = pd.concat([target_correlations,
                                           absolute_target_correlations],
                                          axis=1)
    # Rename the columns.
    target_correlations_table.columns = ['Correlation', 'Absolute Correlation']
    # Drop the target's self-correlation row, then keep only the features
    # whose absolute correlation is not NaN.
    target_correlations_table.drop(label_column_name, axis=0, inplace=True)
    correlation_features = pd.notnull(target_correlations_table.iloc[:, 1])
    target_correlations_table = target_correlations_table[correlation_features]
    # Sort by absolute correlation, strongest first.
    target_correlations_table.sort_values('Absolute Correlation',
                                          ascending=False, inplace=True)
    print('\nFeature correlations to ({}) feature:'.format(label_column_name))
    # NOTE(review): display() is the IPython/Jupyter builtin; this module is
    # expected to run inside a notebook.
    display(target_correlations_table)


def display_correlation_heatmap(data_frame):
    '''
    Compute the pairwise feature correlations and render them as an annotated
    heatmap, so relationships between input features are easy to spot.

    Adapted from:
    https://www.kaggle.com/willkoehrsen/start-here-a-gentle-introduction

    Args:
        data_frame : DataFrame
            DataFrame we want to show correlations for.

    Returns:
        correlations_table : DataFrame
            DataFrame containing the correlations between all data features.
    '''
    corr_matrix = data_frame.corr()
    print('\nCorrelation Heatmap:')
    # One large figure with every pairwise correlation annotated in its cell.
    plt.figure(figsize=(16, 12))
    sns.heatmap(corr_matrix, cmap='Blues', annot=True)
    plt.title('Correlation Heatmap')
    plt.show()

    return corr_matrix


def describe_dataset(data_frame, label_column_name):
    '''
    Print an overview of the dataset: head, size, schema, summary statistics,
    missing values, correlations and the label distribution.

    Args:
        data_frame : DataFrame
            DataFrame containing the data stored in the file, in a structured
            format.
        label_column_name : string
            The name of the column containing the labels for each data point
            in the DataFrame.

    Returns:
        None
    '''
    print('\nDataFrame head:')
    display(data_frame)
    print('\nEnron data point count: {}'.format(len(data_frame)))
    print('\nDataFrame info:')
    data_frame.info()
    print('\nDataFrame description:')
    display(data_frame.describe())
    print_missing_values_table(data_frame)
    # The heatmap helper returns the correlation matrix, which the target
    # correlation report reuses below.
    corr_matrix = display_correlation_heatmap(data_frame)
    label_counts = data_frame[label_column_name].value_counts()
    print('\nLabel value counts:\n{}'.format(label_counts))
    print_target_correlation_report(corr_matrix, label_column_name)


def draw_features_boxplots(data_frame, label_column_name, plot_columns):
    '''
    Generate a box plot for each one of the features in a dataframe, in order
    to visualize and help detect easily any outliers present on the data.

    Args:
        data_frame : DataFrame
            DataFrame containing the data stored in the file, in a structured
            format.
        label_column_name : string
            The name of the column containing the labels for each data point in
            the DataFrame.
        plot_columns : integer
            Number of feature plots to display per row.

    Returns:
        None
    '''
    # Separate labels from features for easier plotting.
    labels = data_frame[label_column_name]
    data = data_frame.drop(label_column_name, axis=1)
    # Get the total columns in data, divide it by plot_columns and round it up
    # to get the rows we need to accommodate all features in plot_columns.
    plot_rows = int(math.ceil(float(data.shape[1]) / plot_columns))
    plot_height = plot_rows * 4
    # squeeze=False guarantees a 2-D axes array even when plot_rows == 1, so
    # the axes[row, col] indexing below is always valid.
    _, axes = plt.subplots(plot_rows, plot_columns, figsize=(16, plot_height),
                           squeeze=False)
    print('\nFeature Boxplots:')
    for figure_count, column in enumerate(data.columns):
        # Create a dataframe for plotting, with labels and the current column.
        plot_data = pd.concat([labels, data.loc[:, column]], axis=1)
        # Transform the dataframe to the required format using melt.
        plot_data = pd.melt(plot_data, id_vars=label_column_name,
                            var_name=column, value_name='value')
        # Integer division is required: under Python 3 the original '/' gave
        # a float, which raises when used as an array index.
        figure_row, figure_col = divmod(figure_count, plot_columns)
        ax = axes[figure_row, figure_col]
        sns.boxplot(ax=ax, data=plot_data, hue=label_column_name, x=column,
                    y='value')
        ax.set_xlabel('')
        ax.set_ylabel('')

    plt.show()


def draw_features_swarmplots(data_frame, label_column_name, plot_columns):
    '''
    Generate a swarm plot for each one of the features in a dataframe, in order
    to visualize and help detect easily any outliers present on the data.

    Args:
        data_frame : DataFrame
            DataFrame containing the data stored in the file, in a structured
            format.
        label_column_name : string
            The name of the column containing the labels for each data point in
            the DataFrame.
        plot_columns : integer
            Number of feature plots to display per row.

    Returns:
        None
    '''
    # Separate labels from features for easier plotting.
    labels = data_frame[label_column_name]
    data = data_frame.drop(label_column_name, axis=1)
    # Get the total columns in data, divide it by plot_columns and round it up
    # to get the rows we need to accommodate all features in plot_columns.
    plot_rows = int(math.ceil(float(data.shape[1]) / plot_columns))
    plot_height = plot_rows * 4
    # squeeze=False guarantees a 2-D axes array even when plot_rows == 1, so
    # the axes[row, col] indexing below is always valid.
    _, axes = plt.subplots(plot_rows, plot_columns, figsize=(16, plot_height),
                           squeeze=False)
    print('\nFeature Swarmplots:')
    for figure_count, column in enumerate(data.columns):
        # Create a dataframe for plotting, with labels and the current column.
        plot_data = pd.concat([labels, data.loc[:, column]], axis=1)
        # Transform the dataframe to the required format using melt.
        plot_data = pd.melt(plot_data, id_vars=label_column_name,
                            var_name=column, value_name='value')
        # Integer division is required: under Python 3 the original '/' gave
        # a float, which raises when used as an array index.
        figure_row, figure_col = divmod(figure_count, plot_columns)
        ax = axes[figure_row, figure_col]
        sns.swarmplot(ax=ax, data=plot_data, hue=label_column_name, x=column,
                      y='value')
        ax.set_xlabel('')
        ax.set_ylabel('')

    plt.show()


def plot_features(data_frame, label_column_name, plot_columns):
    '''
    Visualize every feature in the dataframe with both box plots and swarm
    plots, to make any outliers present in the data easy to spot.

    Args:
        data_frame : DataFrame
            DataFrame containing the data stored in the file, in a structured
            format.
        label_column_name : string
            The name of the column containing the labels for each data point
            in the DataFrame.
        plot_columns : integer
            Number of feature plots to display per row.

    Returns:
        None
    '''
    draw_features_boxplots(data_frame, label_column_name, plot_columns)
    draw_features_swarmplots(data_frame, label_column_name, plot_columns)


def get_enron_feature_list():
    '''
    Retrieve the feature list to be used for the Enron POI classification
    problem:

    Financial features (all units are in US dollars):
        salary, deferral_payments, total_payments, loan_advances, bonus,
        restricted_stock_deferred, deferred_income, total_stock_value,
        expenses, exercised_stock_options, other, long_term_incentive,
        restricted_stock, director_fees
    Email features ('email_address' is string, the rest, email message counts):
        email_address, to_messages, from_poi_to_this_person, from_messages,
        from_this_person_to_poi, shared_receipt_with_poi
    POI label (boolean, represented as integer):
        poi

    Args:
        None

    Returns:
        features_list : list
            The list of features that will be used for solving the POI
            classification problem.
    '''
    financial_features = ['salary', 'deferral_payments', 'total_payments',
                          'loan_advances', 'bonus',
                          'restricted_stock_deferred', 'deferred_income',
                          'total_stock_value', 'expenses',
                          'exercised_stock_options', 'other',
                          'long_term_incentive', 'restricted_stock',
                          'director_fees']
    email_features = ['to_messages', 'from_poi_to_this_person',
                      'from_messages', 'from_this_person_to_poi',
                      'shared_receipt_with_poi']

    # The first feature must be 'poi'.
    return ['poi'] + financial_features + email_features


def get_labels_features(data_dictionary, feature_list):
    """
    Retrieve the labels and features for the given dataset, after applying
    some arranging and cleaning operations:
    - Keys (record IDs) are sorted by alphabetical order.
    - NaN strings are converted to 0.0.
    - Features missing from a record are treated as 0.0.
    - Data points where all features have a value of zero are removed.

    Note that the first feature is assumed to be the label feature and is not
    used for determining if the data point should be removed or not.

    Args:
        data_dictionary : Dictionary
            Dictionary containing the data stored in the file, in a structured
            format.
        feature_list : list
            The list of features that needs to be extracted from the dictionary
            and returned for the classification problem. The first feature on
            the list needs to contain the data labels.

    Returns:
        labels : ndarray
            Array with the labels for each data point in the dataset.
        features : ndarray
            Array with the features for each data point in the dataset.
    """
    labels = []
    features = []
    for key in sorted(data_dictionary.keys()):
        data_point_values = []
        # Get the data point values in a list.
        for feature in feature_list:
            try:
                value = data_dictionary[key][feature]
            except KeyError:
                # A missing feature now counts as zero. The original code
                # printed this error and then re-read the key outside the
                # try block, raising the very KeyError it reported.
                print('Error: key {} not present'.format(feature))
                value = 0
            if value == 'NaN':
                value = 0
            data_point_values.append(float(value))

        # The first feature is assumed to be the label; keep the data point
        # only when at least one real feature is non-zero.
        label_value = data_point_values[0]
        feature_values = data_point_values[1:]
        if any(value != 0 for value in feature_values):
            labels.append(np.array(label_value))
            features.append(np.array(feature_values))

    labels = np.array(labels)
    features = np.array(features)
    print('\nCurrent features and labels shapes:')
    print('Enron labels shape: {}'.format(labels.shape))
    print('Enron features shape: {}'.format(features.shape))

    return labels, features


def get_best_enron_features(labels, features, feature_list, top_n_features):
    '''
    Select the best features to use automatically in a classification problem,
    by ranking the RandomForestClassifier feature importances.

    Args:
        labels : ndarray
            Array with the labels for each data point in the dataset.
        features : ndarray
            Array with the features for each data point in the dataset.
        feature_list : list
            The list of features that needs to be extracted from the dictionary
            and returned. The first feature is expected to one with the labels.
        top_n_features : integer
            Is the number of features that will be selected from the original
            dataset, according to their importance.

    Returns:
        best_features_list : list
            The list of the best features that will be used for solving the POI
            classification problem.
    '''
    forest = RandomForestClassifier(n_estimators=500, n_jobs=8,
                                    random_state=42)
    forest.fit(features, labels)
    importances = forest.feature_importances_
    # Indices sorted ascending by importance; feature_list[0] is the label
    # name, so the importance indices map onto feature_list[1:].
    sorted_indices = np.argsort(importances)
    feature_names = feature_list[1:]
    ranked_features = [feature_names[index] for index in sorted_indices]
    best_features_list = [feature_list[0]] + ranked_features[-top_n_features:]
    print('\nSelected features (with label):\n{}'.format(best_features_list))

    # Plot the relative importances, least important at the bottom.
    print('\nFeature Importances:')
    plt.figure(figsize=(16, 12))
    plt.title('Feature Importances')
    plt.barh(range(len(sorted_indices)), importances[sorted_indices],
             color='b')
    plt.yticks(range(len(sorted_indices)), ranked_features)
    plt.xlabel('Relative Importance')
    plt.show()

    return best_features_list


def remove_enron_outliers(enron_data):
    '''
    Return the labels and features for the Enron dataset, after eliminating the
    outlier data points from the different features.

    Args:
        enron_data : Dictionary
            Dictionary containing the data stored in the file, in a structured
            format.

    Returns:
        enron_data : Dictionary
            Dictionary containing the data after removing the outliers.

    '''
    negatives_removal_features = ['deferral_payments', 'restricted_stock',
                                  'total_stock_value']
    removed_outliers = 0
    for key in sorted(enron_data.keys()):
        for feature in negatives_removal_features:
            try:
                value = enron_data[key][feature]
            except KeyError:
                print('Error: key {} not present'.format(feature))
                continue
            # 'NaN' placeholder strings are not comparable to 0 under
            # Python 3 (str < int raises TypeError); only clamp real
            # negative numbers to zero.
            if value != 'NaN' and value < 0:
                enron_data[key][feature] = 0
                removed_outliers += 1

    print('\nOutlier features:\n{}'.format(negatives_removal_features))
    print('Total outliers removed:\n{}'.format(removed_outliers))

    return enron_data


def add_enron_features(labels, features):
    '''
    Return the labels and features for the Enron dataset, after adding new
    relevant features to help improve the classification performance.

    Currently a placeholder: no engineered features are added yet, so the
    inputs are returned unchanged.

    Args:
        labels : ndarray
            Array with the labels for each data point in the enron dataset.
        features : ndarray
            Array with the features for each data point in the enron dataset.

    Returns:
        labels : ndarray
            Array with the labels for each data point, after adding the new
            features.
        features : ndarray
            Array with the features for each data point, after adding the new
            features.
    '''
    return labels, features


def get_pipelines_definitions():
    '''
    Define the different pipelines that will be used to train and finetune the
    classification model.

    Args:
        None

    Returns:
        pipelines : Dictionary
            A dictionary containing all the pipelines that will be used to fit
            the model in order to select the one that produces the best results
            for the given problem.
    '''
    # Preprocessing variations shared by every estimator's parameter grid.
    # The step keys ('scale', 'reduce_dim', 'classify') must match the step
    # names of the pipeline built by get_dummy_pipeline_with_memory().
    scale_variations = [None, RobustScaler(), MinMaxScaler(), Normalizer()]
    reduce_dim_variations = [None, PCA(2), PCA(3), PCA(4)]
    pipelines = {
        # GaussianNB has no hyper-parameters to tune; only the preprocessing
        # steps vary.
        'GaussianNB': [{
            'classify': [GaussianNB()],
            'scale': scale_variations,
            'reduce_dim': reduce_dim_variations
        }],
        'DecisionTreeClassifier': [{
            'classify': [DecisionTreeClassifier(random_state=42)],
            'scale': scale_variations,
            'reduce_dim': reduce_dim_variations,
            'classify__criterion': ['entropy', 'gini'],
            'classify__splitter': ['best', 'random'],
            'classify__min_samples_split': [2, 4, 8, 16, 32, 64]
        }],
        # I wasn't able to make SVC work with the 'linear' or 'poly' kernels.
        # Each SVC kernel gets its own grid so kernel-specific parameters
        # (e.g. 'degree' for 'poly') are only searched where they apply.
        'SVC': [{
                    'classify': [SVC(random_state=42)],
                    'scale': scale_variations,
                    'reduce_dim': reduce_dim_variations,
                    'classify__kernel': ['rbf'],
                    'classify__gamma': ['auto', 'scale'],
                    'classify__C': [10, 100, 1000],
                }, {
                    'classify': [SVC(random_state=42)],
                    'scale': scale_variations,
                    'reduce_dim': reduce_dim_variations,
                    'classify__kernel': ['sigmoid'],
                    'classify__gamma': ['auto', 'scale'],
                    'classify__C': [10, 100, 1000]
                }, {
                    'classify': [SVC(random_state=42)],
                    # With other scalers the search won't finish.
                    'scale': [None, MinMaxScaler()],
                    # With values over 6 (8, 16, None) the search won't finish.
                    'reduce_dim': [PCA(2), PCA(4)],
                    'classify__kernel': ['poly'],
                    'classify__gamma': ['auto', 'scale'],
                    'classify__C': [10, 100, 1000],
                    # With a value of 2 or 3 the search won't finish.
                    'classify__degree': [4, 5]
            }],
        'KNeighborsClassifier': [{
            'classify': [KNeighborsClassifier()],
            'scale': scale_variations,
            'reduce_dim': reduce_dim_variations,
            'classify__n_neighbors': [2, 4, 8, 16, 32],
            'classify__weights': ['uniform', 'distance'],
            'classify__algorithm': ['ball_tree', 'kd_tree', 'brute'],
            'classify__p': [1, 2]
        }],
        'RandomForestClassifier': [{
            'classify': [RandomForestClassifier(random_state=42)],
            'scale': scale_variations,
            'reduce_dim': reduce_dim_variations,
            'classify__n_estimators': [4, 8, 16],
            'classify__criterion': ['entropy', 'gini'],
            'classify__min_samples_split': [4, 8, 16],
            'classify__max_depth': [4, 8, 16],
            'classify__max_features': [None, 'sqrt', 'log2']
        }],
        'AdaBoostClassifier': [{
                'classify': [AdaBoostClassifier(random_state=42)],
                'scale': scale_variations,
                'reduce_dim': reduce_dim_variations,
                # None falls back to AdaBoost's default base estimator.
                'classify__base_estimator': [
                    None,
                    SVC(kernel='poly', gamma='scale', degree=5),
                    DecisionTreeClassifier(splitter='random')
                ],
                'classify__n_estimators': [32, 64, 128],
                'classify__algorithm': ['SAMME'],
                'classify__learning_rate': [0.05, 0.1, 0.3, 1]
        }],
        # KMeans is unsupervised; n_clusters=2 mirrors the binary POI label.
        'KMeans': [{
                'classify': [KMeans(random_state=42)],
                'scale': scale_variations,
                'reduce_dim': reduce_dim_variations,
                'classify__n_clusters': [2]
        }]
    }

    return pipelines


def get_dummy_pipeline_with_memory():
    '''
    Return a pipeline to be used in a search strategy (e.g. GridSearchCV,
    RandomSearchCV, etc.), with the correct steps in the right sequence, but
    initialized with arbitrary estimators (because the specific estimators
    to use in the search will be defined by means of the param_grid).

    The returned pipeline uses memory to improve search performance.

    Args:
        None

    Returns:
        pipeline : Pipeline
            A Pipeline object with the desired steps in the proper sequence,
            but initialized with arbitrary estimators, and with memory usage
            enabled.
    '''
    # The stdlib tempfile is imported locally because the module-level
    # 'tempfile' name is the backports package.
    import tempfile as std_tempfile

    # mkdtemp keeps the cache directory alive after this function returns.
    # The previous 'with TemporaryDirectory()' block deleted the directory
    # before the returned pipeline ever used it for caching. The directory
    # is intentionally not removed here; the OS temp cleanup handles it.
    cache_dir = std_tempfile.mkdtemp(prefix='poi_id_')
    # The steps used are just for initializing the pipeline. The actual
    # steps are defined inside the param_grid.
    pipeline = Pipeline(steps=[('scale', RobustScaler()),
                               ('reduce_dim', PCA()),
                               ('classify', GaussianNB())],
                        memory=cache_dir)

    return pipeline


def get_best_estimator_metrics(results, metrics):
    '''
    Extract the best estimator's scores from the search results.

    The best estimator is the candidate with the highest mean test score for
    the main metric; the remaining metrics are read at that same candidate's
    index.

    Args:
        results : DataFrame
            DataFrame with the results of the grid search.
        metrics : list
            List containing the names of the metrics evaluated for the
            estimator during the search. The first metric in the list is
            assumed to be the main metric, which was used to select the best
            estimator.

    Returns:
        estimator_metrics : list
            List containing the best estimator's values for the metrics
            evaluated during the search.
    '''
    report_line = 'Best Estimator {}: {:.4f}'
    primary_scores = results['mean_test_' + metrics[0]]
    best_value = max(primary_scores)
    best_index = np.argmax(primary_scores, axis=0)
    print(report_line.format(metrics[0].title(), best_value))

    estimator_metrics = [best_value]
    for metric in metrics[1:]:
        secondary_value = results['mean_test_' + metric][best_index]
        print(report_line.format(metric.title(), secondary_value))
        estimator_metrics.append(secondary_value)

    return estimator_metrics


def add_best_metric_value_marker(results, axe, x_values, metric, color):
    '''
    For a metric, plot a dotted vertical line marked with an x at the best
    score obtained, and annotate it with the value for that score.

    Args:
        results : DataFrame
            DataFrame with the results of the grid search.
        axe : Axes
            Axe where we'll plot the dotted vertical line.
        x_values : ndarray
            Array with the values used for the chart's X axis.
        metric : string
            The name of the metric whose best value we want to mark.
        color : string
            The code of the color we want to mark the best value with.

    Returns:
        None
    '''
    # The candidate ranked first for this metric is the best one.
    best_index = np.nonzero(results['rank_test_%s' % metric] == 1)[0][0]
    best_score = results['mean_test_%s' % metric][best_index]
    best_x = x_values[best_index]
    # Dash-dotted vertical line from the axis up to the best score, topped
    # with an x marker and labeled with the score value.
    axe.plot([best_x, best_x], [0, best_score], linestyle='-.',
             color=color, marker='x', markeredgewidth=3, ms=8)
    axe.annotate('%0.2f' % best_score, (best_x, best_score + 0.005))


def plot_estimator_metrics(estimator, metrics, results):
    '''
    Generate a graphic comparing the results obtained for each one of
    the different candidates, for each one of the different scoring metrics
    used for the estimator search.

    Args:
        estimator : string
            The name of the estimator whose results are going to be plotted.
        metrics : list
            List containing the names of the metrics evaluated for the
            estimator during the search. The first metric in the list is
            assumed to be the main metric, which was used to select the best
            estimator.
        results : DataFrame
            DataFrame with the results of the estimator's grid search.

    Returns:
        None
    '''
    # TODO explore with a pivot table in pandas that gets the average metric
    # score (for all metrics evaluated) for each value of a parameter. This
    # could be then plotted to see the impact of the specific parameter on
    # the results. Iterating over the different parameters we would end up
    # with a group of charts (one per parameter) to detect those parameters
    # most important for solving the particular problem.
    main_metric_name = 'mean_test_' + metrics[0]
    # One X position per evaluated candidate.
    data_points = len(results[main_metric_name])
    x_values = np.arange(data_points)
    plt.figure(figsize=(20, 10))
    plt.title('Results for ' + estimator, fontsize=16)
    plt.xlabel('Candidates')
    plt.ylabel('Score')
    axe = plt.gca()
    axe.set_xlim(0, data_points - 1)
    axe.set_ylim(0.0, 1.0)

    # One color per metric; train scores are drawn dashed and faded, test
    # scores solid.
    for metric, color in zip(sorted(metrics), ['g', 'k', 'b', 'r']):
        for sample, style in (('train', '--'), ('test', '-')):
            sample_score_mean = results['mean_%s_%s' % (sample, metric)]
            sample_score_std = results['std_%s_%s' % (sample, metric)]
            # Shaded band of one standard deviation around the mean score
            # (only visible for the test sample, alpha is 0 for train).
            axe.fill_between(x_values, sample_score_mean - sample_score_std,
                             sample_score_mean + sample_score_std,
                             alpha=0.1 if sample == 'test' else 0, color=color)
            axe.plot(x_values, sample_score_mean, style, color=color,
                     alpha=1 if sample == 'test' else 0.7,
                     label='%s (%s)' % (metric, sample))

        # Mark the best test score obtained for this metric.
        add_best_metric_value_marker(results, axe, x_values, metric, color)

    plt.legend(loc='best')
    plt.grid(False)
    plt.show()


def get_best_estimator(features, labels, pipelines, cv_strategy, metrics):
    '''
    Get the best estimator from the pipelines, cross validation, metrics and
    refit metrics specified for the search strategy.

    Args:
        features : ndarray
            Array with the features for each data point in the enron dataset.
        labels : ndarray
            Array with the labels for each data point in the enron dataset.
        pipelines : Dictionary
            Dictionary with specification of the different pipelines we want to
            use to try and solve this particular problem.
        cv_strategy : cross-validation generator
            Method from the model_selection package that defines a cross
            validation strategy to be used for this particular problem.
        metrics : Dictionary
            Dictionary containing the names of the different metrics we want to
            measure for each one of the evaluated estimators. The first metric
            in the Dictionary is assumed to be the main metric to use for
            choosing the best estimator.

    Returns:
        best_results : DataFrame
            DataFrame with the best results of the grid search, or None when
            no pipelines were supplied.
        best_estimator : Object
            The best estimator found during the search, or None when no
            pipelines were supplied.
    '''
    metric_names = list(metrics.keys())
    print('\nPerforming Model Optimizations...')
    best_main_metric_value = -1.0
    # None sentinels (instead of the previous empty strings) let callers
    # detect that no estimator was evaluated at all.
    best_estimator = None
    best_results = None
    pipeline = get_dummy_pipeline_with_memory()
    for estimator, pipeline_definition in pipelines.items():
        print('\nAnalyzing {}...'.format(estimator))
        # NOTE(review): 'iid' was removed in scikit-learn 0.24; drop the
        # argument when upgrading past the version this project pins.
        clf = GridSearchCV(pipeline, param_grid=pipeline_definition,
                           cv=cv_strategy, scoring=metrics,
                           refit=metric_names[0], iid=False,
                           n_jobs=8, verbose=True, error_score='raise',
                           return_train_score=True)
        clf.fit(features, labels)
        results = clf.cv_results_
        print('\nBest {} Found:\n{}\n'.format(estimator, clf.best_estimator_))
        best_estimator_metrics = get_best_estimator_metrics(results,
                                                            metric_names)
        plot_estimator_metrics(estimator, metric_names, results)
        # Keep the candidate with the highest main-metric score seen so far.
        if best_estimator_metrics[0] > best_main_metric_value:
            best_estimator = clf.best_estimator_
            best_results = results
            best_main_metric_value = best_estimator_metrics[0]

    return best_results, best_estimator


def custom_score(labels, predictions):
    '''
    Score a set of predictions against the true labels using a weighted
    average of accuracy, precision and recall, so that the selected model
    has good accuracy while keeping acceptable precision and recall.

    Args:
        labels : ndarray
            Array with the true label for each data point in the dataset.
        predictions : ndarray
            Array with the predicted label for each data point.

    Returns:
        total_score : double
            The combined score assigned to the model, given the predictions.
    '''
    accuracy_value = accuracy_score(labels, predictions)
    precision_value = precision_score(labels, predictions)
    recall_value = recall_score(labels, predictions)
    # Listing accuracy twice doubles its weight in the average.
    components = [accuracy_value, accuracy_value, precision_value,
                  recall_value]

    return average(components)


def print_overall_results(start_time, results, metrics, best_estimator):
    '''
    Report the outcome of the whole search: total elapsed training time,
    the metrics of the best estimator found, and its full definition.

    Args:
        start_time : float
            The time when the search process started.
        results : DataFrame
            DataFrame with the results of the estimator's grid search.
        metrics : list
            List containing the names of the metrics evaluated for the
            estimator during the search.
        best_estimator : string
            The definition of the best estimator found.

    Returns:
        None
    '''
    elapsed_seconds = round(time() - start_time, 3)
    metric_names = list(metrics.keys())
    print('\nTotal training time: {} s'.format(elapsed_seconds))
    print('\nBest Overall Results:')
    # Printing of the per-metric values happens inside this helper.
    get_best_estimator_metrics(results, metric_names)
    print('\nBest Overall Estimator Found:\n{}'.format(best_estimator))


# Task 0: Load and explore the dataset and features.
# Loads the pickled Enron dataset, builds a cleaned DataFrame and produces
# descriptive statistics and per-feature plots against the 'poi' label.
enron_data = load_data('final_project_dataset.pkl')
enron_data_frame = get_clean_enron_dataframe(enron_data)
describe_dataset(enron_data_frame, 'poi')
plot_features(enron_data_frame, 'poi', 3)

# Task 1: Remove outliers
# NOTE(review): outlier removal is currently disabled; the search below runs
# on the full dataset.
# enron_data = remove_enron_outliers(enron_data)

# Task 2: Select what features you'll use.
# Starts from the full feature list, keeps the 8 best features (per
# get_best_enron_features), then rebuilds labels/features from that subset.
full_enron_feature_list = get_enron_feature_list()
labels, features = get_labels_features(enron_data, full_enron_feature_list)
enron_feature_list = get_best_enron_features(labels, features,
                                             full_enron_feature_list, 8)
labels, features = get_labels_features(enron_data, enron_feature_list)
labels, features = add_enron_features(labels, features)

# TODO Task 3: Create new feature(s)

# Task 4: Try a variety of classifiers
# Please name your classifier clf for easy export below.
# Note that if you want to do PCA or other multi-stage operations,
# you'll need to use Pipelines. For more info:
# http://scikit-learn.org/stable/modules/pipeline.html
pipelines = get_pipelines_definitions()

# Task 5: Tune your classifier to achieve better than .3 precision and recall
# using our testing script. Check the tester.py script in the final project
# folder for details on the evaluation method, especially the test_classifier
# function. Because of the small dataset size, the test script uses stratified
# shuffle split cross validation, so that's what we'll use here as well.
# For more info:
# http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
cv_strategy = StratifiedShuffleSplit(n_splits=100, random_state=42)
# We define all the scoring metrics we want to measure. The custom 'overall'
# score (which averages accuracy twice, precision and recall) is the one used
# to select the best set of parameters and refit the identifier: it rewards
# models that keep good precision and recall, because in this case false
# positives are far better than false negatives, since we don't want to risk
# missing any pois. The 'overall' metric needs to be the first entry, because
# get_best_estimator assumes the one in that position to be the main metric
# to evaluate the selected estimator.
start_time = time()
# To guarantee dictionary order, we pass an iterable of key-value pairs.
metrics = OrderedDict([
    ('overall', make_scorer(custom_score)),
    ('accuracy', 'accuracy'),
    ('recall', 'recall'),
    ('precision', 'precision'),
])
results, best_estimator = get_best_estimator(features, labels, pipelines,
                                             cv_strategy, metrics)
print_overall_results(start_time, results, metrics, best_estimator)

# TODO fix this. ¿Maybe refit is needed here before getting results?
# results = DataFrame.from_dict(best_estimator.cv_results_)
# results.head()

# Task 6: Dump your classifier, dataset, and features_list so anyone can check
# your results. You do not need to change anything below, but make sure that
# the version of poi_id.py that you submit can be run on its own and generates
# the necessary .pkl files for validating your results.
dump_classifier_and_data(best_estimator, enron_data, enron_feature_list)
DataFrame head:
salary to_messages deferral_payments total_payments exercised_stock_options bonus restricted_stock shared_receipt_with_poi restricted_stock_deferred total_stock_value expenses loan_advances from_messages other from_this_person_to_poi poi director_fees deferred_income long_term_incentive from_poi_to_this_person
ALLEN PHILLIP K 201,955.00 2,902.00 2,869,717.00 4,484,442.00 1,729,541.00 4,175,000.00 126,027.00 1,407.00 -126,027.00 1,729,541.00 13,868.00 nan 2,195.00 152.00 65.00 0 nan -3,081,055.00 304,805.00 47.00
BADUM JAMES P nan nan 178,980.00 182,466.00 257,817.00 nan nan nan nan 257,817.00 3,486.00 nan nan nan nan 0 nan nan nan nan
BANNANTINE JAMES M 477.00 566.00 nan 916,197.00 4,046,157.00 nan 1,757,552.00 465.00 -560,222.00 5,243,487.00 56,301.00 nan 29.00 864,523.00 0.00 0 nan -5,104.00 nan 39.00
BAXTER JOHN C 267,102.00 nan 1,295,738.00 5,634,343.00 6,680,544.00 1,200,000.00 3,942,714.00 nan nan 10,623,258.00 11,200.00 nan nan 2,660,303.00 nan 0 nan -1,386,055.00 1,586,055.00 nan
BAY FRANKLIN R 239,671.00 nan 260,455.00 827,696.00 nan 400,000.00 145,796.00 nan -82,782.00 63,014.00 129,142.00 nan nan 69.00 nan 0 nan -201,641.00 nan nan
BAZELIDES PHILIP J 80,818.00 nan 684,694.00 860,136.00 1,599,641.00 nan nan nan nan 1,599,641.00 nan nan nan 874.00 nan 0 nan nan 93,750.00 nan
BECK SALLY W 231,330.00 7,315.00 nan 969,068.00 nan 700,000.00 126,027.00 2,639.00 nan 126,027.00 37,172.00 nan 4,343.00 566.00 386.00 0 nan nan nan 144.00
BELDEN TIMOTHY N 213,999.00 7,991.00 2,144,013.00 5,501,630.00 953,136.00 5,249,999.00 157,569.00 5,521.00 nan 1,110,705.00 17,355.00 nan 484.00 210,698.00 108.00 1 nan -2,334,434.00 nan 228.00
BELFER ROBERT nan nan -102,500.00 102,500.00 3,285.00 nan nan nan 44,093.00 -44,093.00 nan nan nan nan nan 0 3,285.00 nan nan nan
BERBERIAN DAVID 216,582.00 nan nan 228,474.00 1,624,396.00 nan 869,220.00 nan nan 2,493,616.00 11,892.00 nan nan nan nan 0 nan nan nan nan
BERGSIEKER RICHARD P 187,922.00 383.00 nan 618,850.00 nan 250,000.00 659,249.00 233.00 nan 659,249.00 59,175.00 nan 59.00 427,316.00 0.00 0 nan -485,813.00 180,250.00 4.00
BHATNAGAR SANJAY nan 523.00 nan 15,456,290.00 2,604,490.00 nan -2,604,490.00 463.00 15,456,290.00 nan nan nan 29.00 137,864.00 1.00 0 137,864.00 nan nan 0.00
BIBI PHILIPPE A 213,625.00 1,607.00 nan 2,047,593.00 1,465,734.00 1,000,000.00 378,082.00 1,336.00 nan 1,843,816.00 38,559.00 nan 40.00 425,688.00 8.00 0 nan nan 369,721.00 23.00
BLACHMAN JEREMY M 248,546.00 2,475.00 nan 2,014,835.00 765,313.00 850,000.00 189,041.00 2,326.00 nan 954,354.00 84,208.00 nan 14.00 272.00 2.00 0 nan nan 831,809.00 25.00
BLAKE JR. NORMAN P nan nan nan 1,279.00 nan nan nan nan nan nan 1,279.00 nan nan nan nan 0 113,784.00 -113,784.00 nan nan
BOWEN JR RAYMOND M 278,601.00 1,858.00 nan 2,669,589.00 nan 1,350,000.00 252,055.00 1,593.00 nan 252,055.00 65,907.00 nan 27.00 1,621.00 15.00 1 nan -833.00 974,293.00 140.00
BROWN MICHAEL nan 1,486.00 nan 49,288.00 nan nan nan 761.00 nan nan 49,288.00 nan 41.00 nan 1.00 0 nan nan nan 13.00
BUCHANAN HAROLD G 248,017.00 1,088.00 nan 1,054,637.00 825,464.00 500,000.00 189,041.00 23.00 nan 1,014,505.00 600.00 nan 125.00 1,215.00 0.00 0 nan nan 304,805.00 0.00
BUTTS ROBERT H 261,516.00 nan nan 1,271,582.00 nan 750,000.00 417,619.00 nan nan 417,619.00 9,410.00 nan nan 150,656.00 nan 0 nan -75,000.00 175,000.00 nan
BUY RICHARD B 330,546.00 3,523.00 649,584.00 2,355,702.00 2,542,813.00 900,000.00 901,657.00 2,333.00 nan 3,444,470.00 nan nan 1,053.00 400,572.00 71.00 0 nan -694,862.00 769,862.00 156.00
CALGER CHRISTOPHER F 240,189.00 2,598.00 nan 1,639,297.00 nan 1,250,000.00 126,027.00 2,188.00 nan 126,027.00 35,818.00 nan 144.00 486.00 25.00 1 nan -262,500.00 375,304.00 199.00
CARTER REBECCA C 261,809.00 312.00 nan 477,557.00 nan 300,000.00 307,301.00 196.00 -307,301.00 nan nan nan 15.00 540.00 7.00 0 nan -159,792.00 75,000.00 29.00
CAUSEY RICHARD A 415,189.00 1,892.00 nan 1,868,758.00 nan 1,000,000.00 2,502,063.00 1,585.00 nan 2,502,063.00 30,674.00 nan 49.00 307,895.00 12.00 1 nan -235,000.00 350,000.00 58.00
CHAN RONNIE nan nan nan nan nan nan 32,460.00 nan -32,460.00 nan nan nan nan nan nan 0 98,784.00 -98,784.00 nan nan
CHRISTODOULOU DIOMEDES nan nan nan nan 5,127,155.00 nan 950,730.00 nan nan 6,077,885.00 nan nan nan nan nan 0 nan nan nan nan
CLINE KENNETH W nan nan nan nan nan nan 662,086.00 nan -472,568.00 189,518.00 nan nan nan nan nan 0 nan nan nan nan
COLWELL WESLEY 288,542.00 1,758.00 27,610.00 1,490,344.00 nan 1,200,000.00 698,242.00 1,132.00 nan 698,242.00 16,514.00 nan 40.00 101,740.00 11.00 1 nan -144,062.00 nan 240.00
CORDES WILLIAM R nan 764.00 nan nan 651,850.00 nan 386,335.00 58.00 nan 1,038,185.00 nan nan 12.00 nan 0.00 0 nan nan nan 10.00
COX DAVID 314,288.00 102.00 nan 1,101,393.00 117,551.00 800,000.00 378,082.00 71.00 nan 495,633.00 27,861.00 nan 33.00 494.00 4.00 0 nan -41,250.00 nan 0.00
CUMBERLAND MICHAEL S 184,899.00 nan nan 807,956.00 nan 325,000.00 207,940.00 nan nan 207,940.00 22,344.00 nan nan 713.00 nan 0 nan nan 275,000.00 nan
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
SCRIMSHAW MATTHEW nan nan nan nan 759,557.00 nan nan nan nan 759,557.00 nan nan nan nan nan 0 nan nan nan nan
SHANKMAN JEFFREY A 304,110.00 3,221.00 nan 3,038,702.00 1,441,898.00 2,000,000.00 630,137.00 1,730.00 nan 2,072,035.00 178,979.00 nan 2,681.00 1,191.00 83.00 0 nan nan 554,422.00 94.00
SHAPIRO RICHARD S 269,076.00 15,149.00 nan 1,057,548.00 607,837.00 650,000.00 379,164.00 4,527.00 nan 987,001.00 137,767.00 nan 1,215.00 705.00 65.00 0 nan nan nan 74.00
SHARP VICTORIA T 248,146.00 3,136.00 187,469.00 1,576,511.00 281,073.00 600,000.00 213,063.00 2,477.00 nan 494,136.00 116,337.00 nan 136.00 2,401.00 6.00 0 nan nan 422,158.00 24.00
SHELBY REX 211,844.00 225.00 nan 2,003,885.00 1,624,396.00 200,000.00 869,220.00 91.00 nan 2,493,616.00 22,884.00 nan 39.00 1,573,324.00 14.00 1 nan -4,167.00 nan 13.00
SHERRICK JEFFREY B nan 613.00 nan nan 1,426,469.00 nan 405,999.00 583.00 nan 1,832,468.00 nan nan 25.00 nan 18.00 0 nan nan nan 39.00
SHERRIFF JOHN R 428,780.00 3,187.00 nan 4,335,388.00 1,835,558.00 1,500,000.00 1,293,424.00 2,103.00 nan 3,128,982.00 nan nan 92.00 1,852,186.00 23.00 0 nan nan 554,422.00 28.00
SKILLING JEFFREY K 1,111,258.00 3,627.00 nan 8,682,716.00 19,250,000.00 5,600,000.00 6,843,672.00 2,042.00 nan 26,093,672.00 29,336.00 nan 108.00 22,122.00 30.00 1 nan nan 1,920,000.00 88.00
STABLER FRANK 239,502.00 nan nan 1,112,087.00 nan 500,000.00 511,734.00 nan nan 511,734.00 16,514.00 nan nan 356,071.00 nan 0 nan nan nan nan
SULLIVAN-SHAKLOVITZ COLLEEN 162,779.00 nan 181,993.00 999,356.00 1,362,375.00 100,000.00 nan nan nan 1,362,375.00 nan nan nan 162.00 nan 0 nan nan 554,422.00 nan
SUNDE MARTIN 257,486.00 2,647.00 nan 1,545,059.00 nan 700,000.00 698,920.00 2,565.00 nan 698,920.00 nan nan 38.00 111,122.00 13.00 0 nan nan 476,451.00 37.00
TAYLOR MITCHELL S 265,214.00 533.00 227,449.00 1,092,663.00 3,181,250.00 600,000.00 563,798.00 300.00 nan 3,745,048.00 nan nan 29.00 nan 0.00 0 nan nan nan 0.00
THE TRAVEL AGENCY IN THE PARK nan nan nan 362,096.00 nan nan nan nan nan nan nan nan nan 362,096.00 nan 0 nan nan nan nan
THORN TERENCE H 222,093.00 266.00 16,586.00 911,453.00 4,452,476.00 nan 365,320.00 73.00 nan 4,817,796.00 46,145.00 nan 41.00 426,629.00 0.00 0 nan nan 200,000.00 0.00
TILNEY ELIZABETH A 247,338.00 460.00 nan 399,393.00 591,250.00 300,000.00 576,792.00 379.00 nan 1,168,042.00 nan nan 19.00 152,055.00 11.00 0 nan -575,000.00 275,000.00 10.00
UMANOFF ADAM S 288,589.00 111.00 nan 1,130,461.00 nan 788,750.00 nan 41.00 nan nan 53,122.00 nan 18.00 nan 0.00 0 nan nan nan 12.00
URQUHART JOHN A nan nan nan 228,656.00 nan nan nan nan nan nan 228,656.00 nan nan nan nan 0 36,666.00 -36,666.00 nan nan
WAKEHAM JOHN nan nan nan 213,071.00 nan nan nan nan nan nan 103,773.00 nan nan nan nan 0 109,298.00 nan nan nan
WALLS JR ROBERT H 357,091.00 671.00 nan 1,798,780.00 4,346,544.00 850,000.00 1,552,453.00 215.00 nan 5,898,997.00 50,936.00 nan 146.00 2.00 0.00 0 nan nan 540,751.00 17.00
WALTERS GARETH W nan nan 53,625.00 87,410.00 1,030,329.00 nan nan nan nan 1,030,329.00 33,785.00 nan nan nan nan 0 nan nan nan nan
WASAFF GEORGE 259,996.00 400.00 831,299.00 1,034,395.00 1,668,260.00 325,000.00 388,167.00 337.00 nan 2,056,427.00 nan nan 30.00 1,425.00 7.00 0 nan -583,325.00 200,000.00 22.00
WESTFAHL RICHARD K 63,744.00 nan nan 762,135.00 nan nan 384,930.00 nan nan 384,930.00 51,870.00 nan nan 401,130.00 nan 0 nan -10,800.00 256,191.00 nan
WHALEY DAVID A nan nan nan nan 98,718.00 nan nan nan nan 98,718.00 nan nan nan nan nan 0 nan nan nan nan
WHALLEY LAWRENCE G 510,364.00 6,019.00 nan 4,677,574.00 3,282,960.00 3,000,000.00 2,796,177.00 3,920.00 nan 6,079,137.00 57,838.00 nan 556.00 301,026.00 24.00 0 nan nan 808,346.00 186.00
WHITE JR THOMAS E 317,543.00 nan nan 1,934,359.00 1,297,049.00 450,000.00 13,847,074.00 nan nan 15,144,123.00 81,353.00 nan nan 1,085,463.00 nan 0 nan nan nan nan
WINOKUR JR. HERBERT S nan nan nan 84,992.00 nan nan nan nan nan nan 1,413.00 nan nan nan nan 0 108,579.00 -25,000.00 nan nan
WODRASKA JOHN nan nan nan 189,583.00 nan nan nan nan nan nan nan nan nan 189,583.00 nan 0 nan nan nan nan
WROBEL BRUCE nan nan nan nan 139,130.00 nan nan nan nan 139,130.00 nan nan nan nan nan 0 nan nan nan nan
YEAGER F SCOTT 158,403.00 nan nan 360,300.00 8,308,552.00 nan 3,576,206.00 nan nan 11,884,758.00 53,947.00 nan nan 147,950.00 nan 1 nan nan nan nan
YEAP SOON nan nan nan 55,097.00 192,758.00 nan nan nan nan 192,758.00 55,097.00 nan nan nan nan 0 nan nan nan nan

145 rows × 20 columns

Enron data point count: 145

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
Index: 145 entries, ALLEN PHILLIP K to YEAP SOON
Data columns (total 20 columns):
salary                       94 non-null float64
to_messages                  86 non-null float64
deferral_payments            38 non-null float64
total_payments               124 non-null float64
exercised_stock_options      101 non-null float64
bonus                        81 non-null float64
restricted_stock             109 non-null float64
shared_receipt_with_poi      86 non-null float64
restricted_stock_deferred    17 non-null float64
total_stock_value            125 non-null float64
expenses                     94 non-null float64
loan_advances                3 non-null float64
from_messages                86 non-null float64
other                        92 non-null float64
from_this_person_to_poi      86 non-null float64
poi                          145 non-null int64
director_fees                16 non-null float64
deferred_income              48 non-null float64
long_term_incentive          65 non-null float64
from_poi_to_this_person      86 non-null float64
dtypes: float64(19), int64(1)
memory usage: 23.8+ KB

DataFrame description:
salary to_messages deferral_payments total_payments exercised_stock_options bonus restricted_stock shared_receipt_with_poi restricted_stock_deferred total_stock_value expenses loan_advances from_messages other from_this_person_to_poi poi director_fees deferred_income long_term_incentive from_poi_to_this_person
count 94.00 86.00 38.00 124.00 101.00 81.00 109.00 86.00 17.00 125.00 94.00 3.00 86.00 92.00 86.00 145.00 16.00 48.00 65.00 86.00
mean 284,087.54 2,073.86 841,602.53 2,623,421.18 2,959,559.26 1,201,773.07 1,147,424.09 1,176.47 621,892.82 3,352,073.02 54,192.01 27,975,000.00 608.79 465,276.66 41.23 0.12 89,822.88 -581,049.81 746,491.20 64.90
std 177,131.12 2,582.70 1,289,322.63 9,488,105.53 5,499,449.60 1,441,679.44 2,249,770.36 1,178.32 3,845,528.35 6,532,883.10 46,108.38 46,382,560.03 1,841.03 1,389,719.06 100.07 0.33 41,112.70 942,076.40 862,917.42 86.98
min 477.00 57.00 -102,500.00 148.00 3,285.00 70,000.00 -2,604,490.00 2.00 -1,787,380.00 -44,093.00 148.00 400,000.00 12.00 2.00 0.00 0.00 3,285.00 -3,504,386.00 69,223.00 0.00
25% 211,802.00 541.25 79,644.50 386,380.25 506,765.00 425,000.00 252,055.00 249.75 -329,825.00 494,136.00 22,479.00 1,200,000.00 22.75 1,209.00 1.00 0.00 83,674.50 -611,209.25 275,000.00 10.00
50% 258,741.00 1,211.00 221,063.50 1,100,246.50 1,297,049.00 750,000.00 441,096.00 740.50 -140,264.00 1,095,040.00 46,547.50 2,000,000.00 41.00 51,984.50 8.00 0.00 106,164.50 -151,927.00 422,158.00 35.00
75% 308,606.50 2,634.75 867,211.25 2,084,662.75 2,542,813.00 1,200,000.00 985,032.00 1,888.25 -72,419.00 2,606,763.00 78,408.50 41,762,500.00 145.50 357,577.25 24.75 0.00 112,815.00 -37,926.00 831,809.00 72.25
max 1,111,258.00 15,149.00 6,426,990.00 103,559,793.00 34,348,384.00 8,000,000.00 14,761,694.00 5,521.00 15,456,290.00 49,110,078.00 228,763.00 81,525,000.00 14,368.00 10,359,729.00 609.00 1.00 137,864.00 -833.00 5,145,434.00 528.00
Columns in dataframe: 20.
Columns missing values: 19.

Missing values table:
Missing Values % of Total Values
loan_advances 142 97.90
director_fees 129 89.00
restricted_stock_deferred 128 88.30
deferral_payments 107 73.80
deferred_income 97 66.90
long_term_incentive 80 55.20
bonus 64 44.10
from_poi_to_this_person 59 40.70
shared_receipt_with_poi 59 40.70
to_messages 59 40.70
from_this_person_to_poi 59 40.70
from_messages 59 40.70
other 53 36.60
salary 51 35.20
expenses 51 35.20
exercised_stock_options 44 30.30
restricted_stock 36 24.80
total_payments 21 14.50
total_stock_value 20 13.80
Correlation Heatmap:
Label value counts:
0    127
1     18
Name: poi, dtype: int64

Feature correlations to (poi) feature:
Correlation Absolute Correlation
loan_advances 1.00 1.00
exercised_stock_options 0.50 0.50
total_stock_value 0.37 0.37
bonus 0.30 0.30
deferred_income -0.27 0.27
salary 0.26 0.26
long_term_incentive 0.25 0.25
total_payments 0.23 0.23
shared_receipt_with_poi 0.23 0.23
restricted_stock 0.22 0.22
from_poi_to_this_person 0.17 0.17
other 0.12 0.12
from_this_person_to_poi 0.11 0.11
deferral_payments -0.10 0.10
from_messages -0.07 0.07
expenses 0.06 0.06
to_messages 0.06 0.06
Feature Boxplots:
Feature Swarmplots:
Current features and labels shapes:
Enron labels shape: (145L,)
Enron features shape: (145L, 19L)

Selected features (with label):
['poi', 'total_payments', 'deferred_income', 'restricted_stock', 'expenses', 'total_stock_value', 'other', 'bonus', 'exercised_stock_options']

Feature Importances:
Current features and labels shapes:
Enron labels shape: (145L,)
Enron features shape: (145L, 8L)

Performing Model Optimizations...

Analyzing KMeans...
Fitting 100 folds for each of 16 candidates, totalling 1600 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   31.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   34.8s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   41.0s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:   49.8s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:  1.0min
[Parallel(n_jobs=8)]: Done 1600 out of 1600 | elapsed:  1.2min finished

Best KMeans Found:
Pipeline(memory='c:\\users\\carlos\\appdata\\local\\temp\\poi_id_eosqif',
     steps=[('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=4, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classify', KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0))])

Best Estimator Overall: 0.4425
Best Estimator Accuracy: 0.8347
Best Estimator Recall: 0.0750
Best Estimator Precision: 0.0255
Analyzing RandomForestClassifier...
Fitting 100 folds for each of 2592 candidates, totalling 259200 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    4.8s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   11.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:   20.6s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:   33.6s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:   48.6s
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:  1.1min
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:  1.8min
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:  2.4min
[Parallel(n_jobs=8)]: Done 6034 tasks      | elapsed:  2.8min
[Parallel(n_jobs=8)]: Done 7184 tasks      | elapsed:  3.3min
[Parallel(n_jobs=8)]: Done 8434 tasks      | elapsed:  3.8min
[Parallel(n_jobs=8)]: Done 9784 tasks      | elapsed:  4.5min
[Parallel(n_jobs=8)]: Done 11234 tasks      | elapsed:  5.1min
[Parallel(n_jobs=8)]: Done 12784 tasks      | elapsed:  5.7min
[Parallel(n_jobs=8)]: Done 14434 tasks      | elapsed:  6.5min
[Parallel(n_jobs=8)]: Done 16184 tasks      | elapsed:  7.2min
[Parallel(n_jobs=8)]: Done 18034 tasks      | elapsed:  8.0min
[Parallel(n_jobs=8)]: Done 19984 tasks      | elapsed:  9.0min
[Parallel(n_jobs=8)]: Done 22034 tasks      | elapsed:  9.8min
[Parallel(n_jobs=8)]: Done 24184 tasks      | elapsed: 10.9min
[Parallel(n_jobs=8)]: Done 26434 tasks      | elapsed: 11.8min
[Parallel(n_jobs=8)]: Done 28784 tasks      | elapsed: 13.0min
[Parallel(n_jobs=8)]: Done 31234 tasks      | elapsed: 14.0min
[Parallel(n_jobs=8)]: Done 33784 tasks      | elapsed: 15.3min
[Parallel(n_jobs=8)]: Done 36434 tasks      | elapsed: 16.4min
[Parallel(n_jobs=8)]: Done 39184 tasks      | elapsed: 17.8min
[Parallel(n_jobs=8)]: Done 42034 tasks      | elapsed: 19.1min
[Parallel(n_jobs=8)]: Done 44984 tasks      | elapsed: 20.5min
[Parallel(n_jobs=8)]: Done 48034 tasks      | elapsed: 22.0min
[Parallel(n_jobs=8)]: Done 51184 tasks      | elapsed: 23.4min
[Parallel(n_jobs=8)]: Done 54434 tasks      | elapsed: 25.0min
[Parallel(n_jobs=8)]: Done 57784 tasks      | elapsed: 26.8min
[Parallel(n_jobs=8)]: Done 61234 tasks      | elapsed: 28.3min
[Parallel(n_jobs=8)]: Done 64784 tasks      | elapsed: 30.0min
[Parallel(n_jobs=8)]: Done 68434 tasks      | elapsed: 31.9min
[Parallel(n_jobs=8)]: Done 72184 tasks      | elapsed: 33.7min
[Parallel(n_jobs=8)]: Done 76034 tasks      | elapsed: 35.4min
[Parallel(n_jobs=8)]: Done 79984 tasks      | elapsed: 37.1min
[Parallel(n_jobs=8)]: Done 84034 tasks      | elapsed: 38.9min
[Parallel(n_jobs=8)]: Done 88184 tasks      | elapsed: 40.7min
[Parallel(n_jobs=8)]: Done 92434 tasks      | elapsed: 42.8min
[Parallel(n_jobs=8)]: Done 96784 tasks      | elapsed: 44.8min
[Parallel(n_jobs=8)]: Done 101234 tasks      | elapsed: 46.8min
[Parallel(n_jobs=8)]: Done 105784 tasks      | elapsed: 49.0min
[Parallel(n_jobs=8)]: Done 110434 tasks      | elapsed: 51.2min
[Parallel(n_jobs=8)]: Done 115184 tasks      | elapsed: 53.3min
[Parallel(n_jobs=8)]: Done 120034 tasks      | elapsed: 55.6min
[Parallel(n_jobs=8)]: Done 124984 tasks      | elapsed: 57.8min
[Parallel(n_jobs=8)]: Done 130034 tasks      | elapsed: 60.1min
[Parallel(n_jobs=8)]: Done 135184 tasks      | elapsed: 62.4min
[Parallel(n_jobs=8)]: Done 140434 tasks      | elapsed: 64.8min
[Parallel(n_jobs=8)]: Done 145784 tasks      | elapsed: 67.2min
[Parallel(n_jobs=8)]: Done 151234 tasks      | elapsed: 69.6min
[Parallel(n_jobs=8)]: Done 156784 tasks      | elapsed: 72.2min
[Parallel(n_jobs=8)]: Done 162434 tasks      | elapsed: 74.8min
[Parallel(n_jobs=8)]: Done 168184 tasks      | elapsed: 77.5min
[Parallel(n_jobs=8)]: Done 174034 tasks      | elapsed: 80.2min
[Parallel(n_jobs=8)]: Done 179984 tasks      | elapsed: 82.9min
[Parallel(n_jobs=8)]: Done 186034 tasks      | elapsed: 85.7min
[Parallel(n_jobs=8)]: Done 192184 tasks      | elapsed: 88.6min
[Parallel(n_jobs=8)]: Done 198434 tasks      | elapsed: 91.4min
[Parallel(n_jobs=8)]: Done 204784 tasks      | elapsed: 94.3min
[Parallel(n_jobs=8)]: Done 211234 tasks      | elapsed: 97.3min
[Parallel(n_jobs=8)]: Done 217784 tasks      | elapsed: 100.3min
[Parallel(n_jobs=8)]: Done 224434 tasks      | elapsed: 103.2min
[Parallel(n_jobs=8)]: Done 231184 tasks      | elapsed: 106.4min
[Parallel(n_jobs=8)]: Done 238034 tasks      | elapsed: 109.5min
[Parallel(n_jobs=8)]: Done 244984 tasks      | elapsed: 112.8min
[Parallel(n_jobs=8)]: Done 252034 tasks      | elapsed: 115.9min
[Parallel(n_jobs=8)]: Done 259184 tasks      | elapsed: 119.3min
[Parallel(n_jobs=8)]: Done 259200 out of 259200 | elapsed: 119.3min finished

Best RandomForestClassifier Found:
Pipeline(memory='c:\\users\\carlos\\appdata\\local\\temp\\poi_id_eosqif',
     steps=[('scale', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classify', RandomForestClassifier(bootstrap=T...mators=16, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

Best Estimator Overall: 0.5661
Best Estimator Accuracy: 0.8660
Best Estimator Recall: 0.2150
Best Estimator Precision: 0.3173
Analyzing GaussianNB...
Fitting 100 folds for each of 16 candidates, totalling 1600 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done 136 tasks      | elapsed:    2.1s
[Parallel(n_jobs=8)]: Done 736 tasks      | elapsed:    8.7s
[Parallel(n_jobs=8)]: Done 1600 out of 1600 | elapsed:   18.9s finished

Best GaussianNB Found:
Pipeline(memory='c:\\users\\carlos\\appdata\\local\\temp\\poi_id_eosqif',
     steps=[('scale', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classify', GaussianNB(priors=None, var_smoothing=1e-09))])

Best Estimator Overall: 0.6391
Best Estimator Accuracy: 0.8667
Best Estimator Recall: 0.3700
Best Estimator Precision: 0.4532
Analyzing AdaBoostClassifier...
Fitting 100 folds for each of 576 candidates, totalling 57600 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.8s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    7.8s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   19.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:   35.8s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:   57.5s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:  1.5min
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:  2.4min
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:  3.4min
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:  5.6min
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:  7.8min
[Parallel(n_jobs=8)]: Done 6034 tasks      | elapsed:  8.6min
[Parallel(n_jobs=8)]: Done 7184 tasks      | elapsed:  9.9min
[Parallel(n_jobs=8)]: Done 8434 tasks      | elapsed: 12.1min
[Parallel(n_jobs=8)]: Done 9784 tasks      | elapsed: 15.3min
[Parallel(n_jobs=8)]: Done 11234 tasks      | elapsed: 16.4min
[Parallel(n_jobs=8)]: Done 12784 tasks      | elapsed: 18.7min
[Parallel(n_jobs=8)]: Done 14434 tasks      | elapsed: 23.0min
[Parallel(n_jobs=8)]: Done 16184 tasks      | elapsed: 24.5min
[Parallel(n_jobs=8)]: Done 18034 tasks      | elapsed: 27.6min
[Parallel(n_jobs=8)]: Done 19984 tasks      | elapsed: 31.4min
[Parallel(n_jobs=8)]: Done 22034 tasks      | elapsed: 34.8min
[Parallel(n_jobs=8)]: Done 24184 tasks      | elapsed: 42.3min
[Parallel(n_jobs=8)]: Done 26434 tasks      | elapsed: 46.2min
[Parallel(n_jobs=8)]: Done 28784 tasks      | elapsed: 55.1min
[Parallel(n_jobs=8)]: Done 31234 tasks      | elapsed: 59.1min
[Parallel(n_jobs=8)]: Done 33784 tasks      | elapsed: 67.9min
[Parallel(n_jobs=8)]: Done 36434 tasks      | elapsed: 72.2min
[Parallel(n_jobs=8)]: Done 39184 tasks      | elapsed: 80.1min
[Parallel(n_jobs=8)]: Done 42034 tasks      | elapsed: 81.3min
[Parallel(n_jobs=8)]: Done 44984 tasks      | elapsed: 82.5min
[Parallel(n_jobs=8)]: Done 48034 tasks      | elapsed: 83.9min
[Parallel(n_jobs=8)]: Done 51184 tasks      | elapsed: 85.2min
[Parallel(n_jobs=8)]: Done 54434 tasks      | elapsed: 86.6min
[Parallel(n_jobs=8)]: Done 57600 out of 57600 | elapsed: 88.0min finished

Best AdaBoostClassifier Found:
Pipeline(memory='c:\\users\\carlos\\appdata\\local\\temp\\poi_id_eosqif',
     steps=[('scale', None), ('reduce_dim', None), ('classify', AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=1,
          n_estimators=32, random_state=42))])

Best Estimator Overall: 0.5520
Best Estimator Accuracy: 0.8507
Best Estimator Recall: 0.2200
Best Estimator Precision: 0.2867
Analyzing SVC...
Fitting 100 folds for each of 240 candidates, totalling 24000 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  68 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 368 tasks      | elapsed:    5.6s
[Parallel(n_jobs=8)]: Done 868 tasks      | elapsed:   13.2s
[Parallel(n_jobs=8)]: Done 1568 tasks      | elapsed:   23.9s
[Parallel(n_jobs=8)]: Done 2468 tasks      | elapsed:   37.9s
[Parallel(n_jobs=8)]: Done 3568 tasks      | elapsed:   54.3s
[Parallel(n_jobs=8)]: Done 4868 tasks      | elapsed:  1.2min
[Parallel(n_jobs=8)]: Done 6368 tasks      | elapsed:  1.6min
[Parallel(n_jobs=8)]: Done 8068 tasks      | elapsed:  2.1min
[Parallel(n_jobs=8)]: Done 9968 tasks      | elapsed:  2.5min
[Parallel(n_jobs=8)]: Done 12068 tasks      | elapsed:  3.1min
[Parallel(n_jobs=8)]: Done 14368 tasks      | elapsed:  3.7min
[Parallel(n_jobs=8)]: Done 16868 tasks      | elapsed:  4.3min
[Parallel(n_jobs=8)]: Done 19568 tasks      | elapsed:  5.0min
[Parallel(n_jobs=8)]: Done 22423 tasks      | elapsed:  6.0min
[Parallel(n_jobs=8)]: Done 24000 out of 24000 | elapsed: 10.5min finished

Best SVC Found:
Pipeline(memory='c:\\users\\carlos\\appdata\\local\\temp\\poi_id_eosqif',
     steps=[('scale', None), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=4, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classify', SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False))])

Best Estimator Overall: 0.5258
Best Estimator Accuracy: 0.8740
Best Estimator Recall: 0.1300
Best Estimator Precision: 0.2250
Analyzing KNeighborsClassifier...
Fitting 100 folds for each of 960 candidates, totalling 96000 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 356 tasks      | elapsed:    6.3s
[Parallel(n_jobs=8)]: Done 856 tasks      | elapsed:   15.2s
[Parallel(n_jobs=8)]: Done 1556 tasks      | elapsed:   28.5s
[Parallel(n_jobs=8)]: Done 2456 tasks      | elapsed:   42.3s
[Parallel(n_jobs=8)]: Done 3556 tasks      | elapsed:  1.0min
[Parallel(n_jobs=8)]: Done 4856 tasks      | elapsed:  1.4min
[Parallel(n_jobs=8)]: Done 6356 tasks      | elapsed:  1.8min
[Parallel(n_jobs=8)]: Done 8056 tasks      | elapsed:  2.3min
[Parallel(n_jobs=8)]: Done 9956 tasks      | elapsed:  2.8min
[Parallel(n_jobs=8)]: Done 12056 tasks      | elapsed:  3.5min
[Parallel(n_jobs=8)]: Done 14356 tasks      | elapsed:  4.2min
[Parallel(n_jobs=8)]: Done 16856 tasks      | elapsed:  4.9min
[Parallel(n_jobs=8)]: Done 19556 tasks      | elapsed:  5.6min
[Parallel(n_jobs=8)]: Done 22456 tasks      | elapsed:  6.5min
[Parallel(n_jobs=8)]: Done 25556 tasks      | elapsed:  7.4min
[Parallel(n_jobs=8)]: Done 28856 tasks      | elapsed:  8.3min
[Parallel(n_jobs=8)]: Done 32356 tasks      | elapsed:  9.4min
[Parallel(n_jobs=8)]: Done 36056 tasks      | elapsed: 10.5min
[Parallel(n_jobs=8)]: Done 39956 tasks      | elapsed: 11.7min
[Parallel(n_jobs=8)]: Done 44056 tasks      | elapsed: 12.8min
[Parallel(n_jobs=8)]: Done 48356 tasks      | elapsed: 14.1min
[Parallel(n_jobs=8)]: Done 52856 tasks      | elapsed: 15.4min
[Parallel(n_jobs=8)]: Done 57556 tasks      | elapsed: 16.7min
[Parallel(n_jobs=8)]: Done 62456 tasks      | elapsed: 18.2min
[Parallel(n_jobs=8)]: Done 67556 tasks      | elapsed: 19.6min
[Parallel(n_jobs=8)]: Done 72856 tasks      | elapsed: 21.1min
[Parallel(n_jobs=8)]: Done 78356 tasks      | elapsed: 22.7min
[Parallel(n_jobs=8)]: Done 84056 tasks      | elapsed: 24.3min
[Parallel(n_jobs=8)]: Done 89956 tasks      | elapsed: 26.0min
[Parallel(n_jobs=8)]: Done 96000 out of 96000 | elapsed: 27.7min finished

Best KNeighborsClassifier Found:
Pipeline(memory='c:\\users\\carlos\\appdata\\local\\temp\\poi_id_eosqif',
     steps=[('scale', None), ('reduce_dim', None), ('classify', KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=4, p=1,
           weights='distance'))])

Best Estimator Overall: 0.5574
Best Estimator Accuracy: 0.8607
Best Estimator Recall: 0.2200
Best Estimator Precision: 0.2883
Analyzing DecisionTreeClassifier...
Fitting 100 folds for each of 384 candidates, totalling 38400 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done 136 tasks      | elapsed:    2.0s
[Parallel(n_jobs=8)]: Done 736 tasks      | elapsed:    9.0s
[Parallel(n_jobs=8)]: Done 1736 tasks      | elapsed:   22.3s
[Parallel(n_jobs=8)]: Done 3136 tasks      | elapsed:   39.3s
[Parallel(n_jobs=8)]: Done 4936 tasks      | elapsed:   59.8s
[Parallel(n_jobs=8)]: Done 7136 tasks      | elapsed:  1.4min
[Parallel(n_jobs=8)]: Done 9736 tasks      | elapsed:  1.9min
[Parallel(n_jobs=8)]: Done 12736 tasks      | elapsed:  2.5min
[Parallel(n_jobs=8)]: Done 16136 tasks      | elapsed:  3.2min
[Parallel(n_jobs=8)]: Done 19936 tasks      | elapsed:  4.0min
[Parallel(n_jobs=8)]: Done 24136 tasks      | elapsed:  4.8min
[Parallel(n_jobs=8)]: Done 28736 tasks      | elapsed:  5.7min
[Parallel(n_jobs=8)]: Done 33736 tasks      | elapsed:  6.7min
[Parallel(n_jobs=8)]: Done 38400 out of 38400 | elapsed:  7.6min finished

Best DecisionTreeClassifier Found:
Pipeline(memory='c:\\users\\carlos\\appdata\\local\\temp\\poi_id_eosqif',
     steps=[('scale', None), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=4, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classify', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
 ...        min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'))])

Best Estimator Overall: 0.6096
Best Estimator Accuracy: 0.8547
Best Estimator Recall: 0.3550
Best Estimator Precision: 0.3742
Total training time: 15288.313 s

Best Overall Results:
Best Estimator Overall: 0.6391
Best Estimator Accuracy: 0.8667
Best Estimator Recall: 0.3700
Best Estimator Precision: 0.4532

Best Overall Estimator Found:
Pipeline(memory='c:\\users\\carlos\\appdata\\local\\temp\\poi_id_eosqif',
     steps=[('scale', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classify', GaussianNB(priors=None, var_smoothing=1e-09))])
In [3]:
#!/usr/bin/python

import sys
from time import time
import math
import pickle
import pandas as pd
import seaborn as sns
import numpy as np
from pandas import DataFrame
from numpy.lib.function_base import average
from collections import OrderedDict
from matplotlib import pyplot as plt
from sklearn.preprocessing import RobustScaler, MinMaxScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from backports import tempfile
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import make_scorer
from tester import dump_classifier_and_data
sys.path.append('../tools/')


def load_data(file_path):
    '''
    Retrieve the dataset stored in the specific pickle file path.

    Args:
        file_path : string
            The absolute file path for the pickle file containing the data.

    Returns:
        dataset : Dictionary
            Dictionary containing the data stored in the file, in a structured
            format.
    '''
    # Pickle files are binary: open in 'rb' (the original text-mode 'r'
    # fails under Python 3 and corrupts data on Windows under Python 2).
    # The context manager guarantees the handle is closed even if
    # pickle.load raises.
    # NOTE: only load pickle files from trusted sources - pickle.load can
    # execute arbitrary code when fed untrusted input.
    with open(file_path, 'rb') as pickle_file:
        dataset = pickle.load(pickle_file)

    return dataset


def get_clean_enron_dataframe(enron_data):
    '''
    Performs cleaning operations on the enron_data_frame.

    Args:
        enron_data : Dictionary
            Dictionary containing the data stored in the file, in a structured
            format. Must contain a 'TOTAL' key and an 'email_address' field
            per record (both are removed here; a missing 'TOTAL' row raises
            KeyError).

    Returns:
        enron_data_frame : DataFrame
            DataFrame containing the data stored in the file, in a structured
            pandas format, after cleaning the data.
    '''
    pd.options.display.float_format = '{:20,.2f}'.format
    enron_data_frame = DataFrame.from_dict(enron_data, orient='index')
    # Drop unwanted rows/columns: 'TOTAL' is a spreadsheet aggregation
    # artifact rather than a person, and email addresses are not usable
    # numeric features.
    enron_data_frame.drop('TOTAL', axis=0, inplace=True)
    enron_data_frame.drop('email_address', axis=1, inplace=True)
    # All NaN strings are converted to Numpy nan values, which allows the
    # describe function to produce proper numeric values for all statistics.
    # Use np.nan: the np.NaN alias was removed in NumPy 2.0.
    enron_data_frame.replace('NaN', np.nan, regex=True, inplace=True)
    # Convert True to 1 and False to 0.
    enron_data_frame.replace({True: 1, False: 0}, inplace=True)

    return enron_data_frame


def print_missing_values_table(data_frame):
    '''
    Summarize, per column, how many values are missing and what share of
    the rows that represents, for the columns that have any gaps at all.

    Adapted from:
    https://www.kaggle.com/willkoehrsen/start-here-a-gentle-introduction

    Args:
        data_frame : DataFrame
            DataFrame we want to inspect for columns with missing values.

    Returns:
        missing_values_table : DataFrame
            DataFrame containing the missing values statistics for the
            data_frame columns.
    '''
    missing_counts = data_frame.isna().sum()
    missing_share = 100 * missing_counts / len(data_frame)
    missing_values_table = pd.concat([missing_counts, missing_share], axis=1)
    missing_values_table.columns = ['Missing Values', '% of Total Values']
    # Keep only columns where the missing percentage is non-zero.
    has_missing = missing_values_table.iloc[:, 1] != 0
    missing_values_table = missing_values_table[has_missing]
    # Worst offenders first, percentages rounded to one decimal.
    missing_values_table = missing_values_table.sort_values(
                                '% of Total Values', ascending=False).round(1)
    print('\nColumns in dataframe: {}.'.format(data_frame.shape[1]))
    print('Columns missing values: {}.'.format(missing_values_table.shape[0]))
    print('\nMissing values table:')
    display(missing_values_table)

    return missing_values_table


def print_target_correlation_report(correlations_table, label_column_name):
    '''
    Generate a report for the most positive and most negative feature
    correlations with the target feature.

    Args:
        correlations_table : DataFrame
            DataFrame containing the correlations between all data features.
        label_column_name : string
            The name of the column containing the labels for each data point
            in the DataFrame.

    Returns:
        None
    '''
    target_correlations = correlations_table[label_column_name]
    absolute_target_correlations = abs(target_correlations)
    target_correlations_table = pd.concat([target_correlations,
                                           absolute_target_correlations],
                                          axis=1)
    # Rename the columns.
    target_correlations_table.columns = ['Correlation', 'Absolute Correlation']
    # Drop the target's self-correlation row, then keep only features whose
    # absolute correlation is not NaN.
    target_correlations_table.drop(label_column_name, axis=0, inplace=True)
    correlation_features = pd.notnull(target_correlations_table.iloc[:, 1])
    target_correlations_table = target_correlations_table[correlation_features]
    # Sort the table by absolute correlation, descending.
    target_correlations_table.sort_values('Absolute Correlation',
                                          ascending=False, inplace=True)
    print('\nFeature correlations to ({}) feature:'.format(label_column_name))
    display(target_correlations_table)


def display_correlation_heatmap(data_frame):
    '''
    Compute the pairwise feature correlations of the dataframe and render
    them as an annotated heatmap for visual inspection.

    Adapted from:
    https://www.kaggle.com/willkoehrsen/start-here-a-gentle-introduction

    Args:
        data_frame : DataFrame
            DataFrame we want to show correlations for.

    Returns:
        correlations_table : DataFrame
            DataFrame containing the correlations between all data features.
    '''
    corr_matrix = data_frame.corr()
    print('\nCorrelation Heatmap:')
    # One large figure with every pairwise coefficient annotated in-cell.
    plt.figure(figsize=(16, 12))
    sns.heatmap(corr_matrix, cmap='Blues', annot=True)
    plt.title('Correlation Heatmap')
    plt.show()

    return corr_matrix


def describe_dataset(data_frame, label_column_name):
    '''
    Print a battery of summary views of the dataframe (head, info,
    describe, missing-value table, correlation heatmap, label counts and
    target-correlation report) to help understand the data.

    Args:
        data_frame : DataFrame
            DataFrame containing the data stored in the file, in a structured
            format.
        label_column_name : string
            The name of the column containing the labels for each data point
            in the DataFrame.

    Returns:
        None
    '''
    print('\nDataFrame head:')
    display(data_frame)
    print('\nEnron data point count: {}'.format(len(data_frame)))
    print('\nDataFrame info:')
    data_frame.info()
    print('\nDataFrame description:')
    display(data_frame.describe())
    print_missing_values_table(data_frame)
    # The heatmap helper returns the correlation matrix, which feeds the
    # target-correlation report below.
    corr_table = display_correlation_heatmap(data_frame)
    label_counts = data_frame[label_column_name].value_counts()
    print('\nLabel value counts:\n{}'.format(label_counts))
    print_target_correlation_report(corr_table, label_column_name)


def draw_features_boxplots(data_frame, label_column_name, plot_columns):
    '''
    Generate a box plot for each one of the features in a dataframe, in order
    to visualize and help detect easily any outliers present on the data.

    Args:
        data_frame : DataFrame
            DataFrame containing the data stored in the file, in a structured
            format.
        label_column_name : string
            The name of the column containing the labels for each data point in
            the DataFrame.
        plot_columns : integer
            Number of feature plots to display per row.

    Returns:
        None
    '''
    # Separate labels from features for easier plotting.
    labels = data_frame[label_column_name]
    data = data_frame.drop(label_column_name, axis=1)
    # Get the total columns in data, divide it by plot_columns and round it up
    # to get the rows we need to accommodate all features in plot_columns.
    plot_rows = int(math.ceil(float(data.shape[1]) / plot_columns))
    plot_height = plot_rows * 4
    _, axes = plt.subplots(plot_rows, plot_columns, figsize=(16, plot_height))
    figure_count = 0
    print('\nFeature Boxplots:')
    for column in data.columns:
        # Create a dataframe for plotting, with labels and the current column.
        plot_data = pd.concat([labels, data.loc[:, column]], axis=1)
        # Transform the dataframe to the required format using melt.
        plot_data = pd.melt(plot_data, id_vars=label_column_name,
                            var_name=column, value_name='value')
        # Integer division: under Python 3 the original '/' produces a
        # float, which is not a valid axes index.
        figure_row = figure_count // plot_columns
        figure_col = figure_count % plot_columns
        figure_count += 1
        ax = axes[figure_row, figure_col]
        sns.boxplot(ax=ax, data=plot_data, hue=label_column_name, x=column,
                    y='value')
        ax.set_xlabel('')
        ax.set_ylabel('')

    plt.show()


def draw_features_swarmplots(data_frame, label_column_name, plot_columns):
    '''
    Generate a swarm plot for each one of the features in a dataframe, in order
    to visualize and help detect easily any outliers present on the data.

    Args:
        data_frame : DataFrame
            DataFrame containing the data stored in the file, in a structured
            format.
        label_column_name : string
            The name of the column containing the labels for each data point in
            the DataFrame.
        plot_columns : integer
            Number of feature plots to display per row.

    Returns:
        None
    '''
    # Separate labels from features for easier plotting.
    labels = data_frame[label_column_name]
    data = data_frame.drop(label_column_name, axis=1)
    # Get the total columns in data, divide it by plot_columns and round it up
    # to get the rows we need to accommodate all features in plot_columns.
    plot_rows = int(math.ceil(float(data.shape[1]) / plot_columns))
    plot_height = plot_rows * 4
    _, axes = plt.subplots(plot_rows, plot_columns, figsize=(16, plot_height))
    figure_count = 0
    print('\nFeature Swarmplots:')
    for column in data.columns:
        # Create a dataframe for plotting, with labels and the current column.
        plot_data = pd.concat([labels, data.loc[:, column]], axis=1)
        # Transform the dataframe to the required format using melt.
        plot_data = pd.melt(plot_data, id_vars=label_column_name,
                            var_name=column, value_name='value')
        # Integer division: under Python 3 the original '/' produces a
        # float, which is not a valid axes index.
        figure_row = figure_count // plot_columns
        figure_col = figure_count % plot_columns
        figure_count += 1
        ax = axes[figure_row, figure_col]
        sns.swarmplot(ax=ax, data=plot_data, hue=label_column_name, x=column,
                      y='value')
        ax.set_xlabel('')
        ax.set_ylabel('')

    plt.show()


def plot_features(data_frame, label_column_name, plot_columns):
    '''
    Visualize every feature in the dataframe, to help spot outliers easily.

    Args:
        data_frame : DataFrame
            DataFrame containing the data stored in the file, in a structured
            format.
        label_column_name : string
            The name of the column containing the labels for each data point
            in the DataFrame.
        plot_columns : integer
            Number of feature plots to display per row.

    Returns:
        None
    '''
    # Two complementary views: box plots summarize each feature's
    # distribution via quartiles, swarm plots show every individual point.
    draw_features_boxplots(data_frame, label_column_name, plot_columns)
    draw_features_swarmplots(data_frame, label_column_name, plot_columns)


def get_enron_feature_list():
    '''
    Retrieve the feature list to be used for the Enron POI classification
    problem:

    Financial features (all units are in US dollars):
        salary, deferral_payments, total_payments, loan_advances, bonus,
        restricted_stock_deferred, deferred_income, total_stock_value,
        expenses, exercised_stock_options, other, long_term_incentive,
        restricted_stock, director_fees
    Email features ('email_address' is string, the rest, email message counts):
        email_address, to_messages, from_poi_to_this_person, from_messages,
        from_this_person_to_poi, shared_receipt_with_poi
    POI label (boolean, represented as integer):
        poi

    Args:
        None

    Returns:
        features_list : list
            The list of features that will be used for solving the POI
            classification problem.
    '''
    # The label feature ('poi') must always come first.
    label = ['poi']
    financial_features = ['salary', 'deferral_payments', 'total_payments',
                          'loan_advances', 'bonus',
                          'restricted_stock_deferred', 'deferred_income',
                          'total_stock_value', 'expenses',
                          'exercised_stock_options', 'other',
                          'long_term_incentive', 'restricted_stock',
                          'director_fees']
    email_features = ['to_messages', 'from_poi_to_this_person',
                      'from_messages', 'from_this_person_to_poi',
                      'shared_receipt_with_poi']

    return label + financial_features + email_features


def get_labels_features(data_dictionary, feature_list):
    """
    Retrieve the labels and features for the given dataset, after applying
    some arranging and cleaning operations:
    - Keys (record IDs) are sorted by alphabetical order.
    - NaN strings are converted to 0.0.
    - Data points where all features have a value of zero are removed.

    Note that the first feature is assumed to be the label feature and is not
    used for determining if the data point should be removed or not.

    Args:
        data_dictionary : Dictionary
            Dictionary containing the data stored in the file, in a structured
            format.
        feature_list : list
            The list of features that needs to be extracted from the dictionary
            and returned for the classification problem. The first feature on
            the list needs to contain the data labels.

    Returns:
        labels : ndarray
            Array with the labels for each data point in the dataset.
        features : ndarray
            Array with the features for each data point in the dataset.
    """
    labels = []
    features = []
    keys = sorted(data_dictionary.keys())
    for key in keys:
        data_point_values = []
        # Get the data point values in a list.
        for feature in feature_list:
            try:
                data_dictionary[key][feature]
            except KeyError:
                print('Error: key {} not present'.format(feature))

            value = data_dictionary[key][feature]
            if value == 'NaN':
                value = 0
            data_point_values.append(float(value))

        # Logic for deciding whether or not to add the data point. The first
        # feature is assumed to be the label feature, and is not considered.
        label_value = data_point_values[0]
        feature_values = data_point_values[1:]
        for value in feature_values:
            if value != 0 and value != 'NaN':
                labels.append(np.array(label_value))
                features.append(np.array(feature_values))
                break

    labels = np.array(labels)
    features = np.array(features)
    print('\nCurrent features and labels shapes:')
    print('Enron labels shape: {}'.format(labels.shape))
    print('Enron features shape: {}'.format(features.shape))

    return labels, features


def get_best_enron_features(labels, features, feature_list, top_n_features):
    '''
    Select the best features to use automatically in a classification problem,
    by using the RandomForestClassifier feature importances.

    Args:
        labels : ndarray
            Array with the labels for each data point in the dataset.
        features : ndarray
            Array with the features for each data point in the dataset.
        feature_list : list
            The list of features that needs to be extracted from the dictionary
            and returned. The first feature is expected to one with the labels.
        top_n_features : integer
            Is the number of features that will be selected from the original
            dataset, according to their importance.

    Returns:
        best_features_list : list
            The list of the best features that will be used for solving the POI
            classification problem.
    '''
    forest = RandomForestClassifier(n_estimators=500, n_jobs=8,
                                    random_state=42)
    forest.fit(features, labels)
    importances = forest.feature_importances_
    # Feature positions ordered from least to most important.
    sorted_indices = np.argsort(importances)
    feature_names = feature_list[1:]
    ranked_features = [feature_names[i] for i in sorted_indices]
    # Keep the label feature first, then the top N most important features.
    best_features_list = [feature_list[0]] + ranked_features[-top_n_features:]
    print('\nSelected features (with label):\n{}'.format(best_features_list))

    # Horizontal bar chart of every feature's relative importance.
    print('\nFeature Importances:')
    plt.figure(figsize=(16, 12))
    plt.title('Feature Importances')
    plt.barh(range(len(sorted_indices)), importances[sorted_indices],
             color='b')
    plt.yticks(range(len(sorted_indices)), ranked_features)
    plt.xlabel('Relative Importance')
    plt.show()

    return best_features_list


def remove_enron_outliers(enron_data):
    '''
    Return the labels and features for the Enron dataset, after eliminating the
    outlier data points from the different features.

    Negative values in the listed features are treated as data errors and
    zeroed out in place.

    Args:
        enron_data : Dictionary
            Dictionary containing the data stored in the file, in a structured
            format. Feature values may be numbers or the 'NaN' placeholder
            string.

    Returns:
        enron_data : Dictionary
            Dictionary containing the data after removing the outliers.

    '''
    negatives_removal_features = ['deferral_payments', 'restricted_stock',
                                  'total_stock_value']
    removed_outliers = 0
    for key in sorted(enron_data.keys()):
        for feature in negatives_removal_features:
            try:
                value = enron_data[key][feature]
                # Skip 'NaN' placeholder strings explicitly: under Python 3
                # comparing a string with 0 raises TypeError (Python 2
                # silently evaluated it as False).
                if value != 'NaN' and value < 0:
                    enron_data[key][feature] = 0
                    removed_outliers += 1
            except KeyError:
                print('Error: key {} not present'.format(feature))

    print('\nOutlier features:\n{}'.format(negatives_removal_features))
    print('Total outliers removed:\n{}'.format(removed_outliers))

    return enron_data


def add_enron_features(labels, features):
    '''
    Return the labels and features for the Enron dataset, after adding new
    relevant features to help improve the classification performance.

    NOTE: this is currently a placeholder - no engineered features have been
    implemented yet, so the inputs are returned untouched.

    Args:
        labels : ndarray
            Array with the labels for each data point in the enron dataset.
        features : ndarray
            Array with the features for each data point in the enron dataset.

    Returns:
        labels : ndarray
            Array with the labels for each data point, after adding the new
            features.
        features : ndarray
            Array with the features for each data point, after adding the new
            features.
    '''
    return labels, features


def get_pipelines_definitions():
    '''
    Define the different pipelines that will be used to train and finetune the
    classification model.

    Each entry maps a classifier name to a GridSearchCV-style param_grid
    (a list of dicts) for a three-step pipeline with step names 'scale',
    'reduce_dim' and 'classify'. A None variation means the step is skipped.

    Args:
        None

    Returns:
        pipelines : Dictionary
            A dictionary containing all the pipelines that will be used to fit
            the model in order to select the one that produces the best results
            for the given problem.
    '''
    # Shared search space for the preprocessing steps; None disables the step.
    scale_variations = [None, RobustScaler(), MinMaxScaler(), Normalizer()]
    reduce_dim_variations = [None, PCA(2), PCA(3), PCA(4)]
    pipelines = {
        # GaussianNB has no hyperparameters worth tuning; only the
        # preprocessing varies.
        'GaussianNB': [{
            'classify': [GaussianNB()],
            'scale': scale_variations,
            'reduce_dim': reduce_dim_variations
        }],
        'DecisionTreeClassifier': [{
            'classify': [DecisionTreeClassifier(random_state=42)],
            'scale': scale_variations,
            'reduce_dim': reduce_dim_variations,
            'classify__criterion': ['entropy', 'gini'],
            'classify__splitter': ['best', 'random'],
            'classify__min_samples_split': [2, 4, 8, 16, 32, 64]
        }],
        # I wasn't able to make SVC work with the 'linear' or 'poly' kernels.
        # The SVC grid is split into three sub-grids because the 'poly'
        # kernel only completes with a restricted preprocessing space.
        'SVC': [{
                    'classify': [SVC(random_state=42)],
                    'scale': scale_variations,
                    'reduce_dim': reduce_dim_variations,
                    'classify__kernel': ['rbf'],
                    'classify__gamma': ['auto', 'scale'],
                    'classify__C': [10, 100, 1000],
                }, {
                    'classify': [SVC(random_state=42)],
                    'scale': scale_variations,
                    'reduce_dim': reduce_dim_variations,
                    'classify__kernel': ['sigmoid'],
                    'classify__gamma': ['auto', 'scale'],
                    'classify__C': [10, 100, 1000]
                }, {
                    'classify': [SVC(random_state=42)],
                    # With other scalers the search won't finish.
                    'scale': [None, MinMaxScaler()],
                    # With values over 6 (8, 16, None) the search won't finish.
                    'reduce_dim': [PCA(2), PCA(4)],
                    'classify__kernel': ['poly'],
                    'classify__gamma': ['auto', 'scale'],
                    'classify__C': [10, 100, 1000],
                    # With a value of 2 or 3 the search won't finish.
                    'classify__degree': [4, 5]
            }],
        'KNeighborsClassifier': [{
            'classify': [KNeighborsClassifier()],
            'scale': scale_variations,
            'reduce_dim': reduce_dim_variations,
            'classify__n_neighbors': [2, 4, 8, 16, 32],
            'classify__weights': ['uniform', 'distance'],
            'classify__algorithm': ['ball_tree', 'kd_tree', 'brute'],
            # p=1 is Manhattan distance, p=2 Euclidean.
            'classify__p': [1, 2]
        }],
        'RandomForestClassifier': [{
            'classify': [RandomForestClassifier(random_state=42)],
            'scale': scale_variations,
            'reduce_dim': reduce_dim_variations,
            'classify__n_estimators': [4, 8, 16],
            'classify__criterion': ['entropy', 'gini'],
            'classify__min_samples_split': [4, 8, 16],
            'classify__max_depth': [4, 8, 16],
            'classify__max_features': [None, 'sqrt', 'log2']
        }],
        'AdaBoostClassifier': [{
                'classify': [AdaBoostClassifier(random_state=42)],
                'scale': scale_variations,
                'reduce_dim': reduce_dim_variations,
                'classify__base_estimator': [
                    None,
                    SVC(kernel='poly', gamma='scale', degree=5),
                    DecisionTreeClassifier(splitter='random')
                ],
                'classify__n_estimators': [32, 64, 128],
                'classify__algorithm': ['SAMME'],
                'classify__learning_rate': [0.05, 0.1, 0.3, 1]
        }],
        # KMeans used as a 2-cluster classifier (POI / non-POI).
        'KMeans': [{
                'classify': [KMeans(random_state=42)],
                'scale': scale_variations,
                'reduce_dim': reduce_dim_variations,
                'classify__n_clusters': [2]
        }]
    }

    return pipelines


def get_dummy_pipeline_with_memory():
    '''
    Return a pipeline to be used in a search strategy (e.g. GridSearchCV,
    RandomSearchCV, etc.), with the correct steps in the right sequence, but
    initialized with arbitrary estimators (because the specific estimators
    to use in the search will be defined by means of the param_grid).

    The returned pipeline uses memory to improve search performance.

    Args:
        None

    Returns:
        pipeline : Pipeline
            A Pipeline object with the desired steps in the proper sequence,
            but initialized with arbitrary estimators, and with memory usage
            enabled.
    '''
    # NOTE(review): the temporary directory is deleted when this with-block
    # exits, i.e. before the returned pipeline is ever fitted. It appears to
    # work regardless (presumably the caching layer recreates the path on
    # demand), but it defeats the automatic cleanup - confirm the intended
    # cache lifecycle.
    with tempfile.TemporaryDirectory(prefix='poi_id_') as tmpdir:
        # The steps used are just for initializing the pipeline. The actual
        # steps are defined inside the param_grid.
        pipeline = Pipeline(steps=[('scale', RobustScaler()),
                                   ('reduce_dim', PCA()),
                                   ('classify', GaussianNB())],
                            memory=tmpdir)

    return pipeline


def get_best_estimator_metrics(results, metrics):
    '''
    Process the search results DataFrame and extract from it the metrics for
    the best estimator.

    Args:
        results : DataFrame
            DataFrame with the results of the grid search.
        metrics : list
            List containing the names of the metrics evaluated for the
            estimator during the search. The first metric in the list is
            assumed to be the main metric, which was used to select the best
            estimator.

    Returns:
        estimator_metrics : list
            List containing the best estimator's values for the metrics
            evaluated during the search.
    '''
    report_template = 'Best Estimator {}: {:.4f}'
    estimator_metrics = []

    # The first metric is the main one: the best candidate is the one that
    # maximizes it.
    main_results = results['mean_test_' + metrics[0]]
    best_index = np.argmax(main_results, axis=0)
    best_main_value = max(main_results)
    print(report_template.format(metrics[0].title(), best_main_value))
    estimator_metrics.append(best_main_value)

    # Report every remaining metric at the best candidate's position.
    for metric in metrics[1:]:
        metric_value = results['mean_test_' + metric][best_index]
        print(report_template.format(metric.title(), metric_value))
        estimator_metrics.append(metric_value)

    return estimator_metrics


def add_best_metric_value_marker(results, axe, x_values, metric, color):
    '''
    For a metric, plot a dotted vertical line marked with an x at the best
    score obtained, and annotate it with the value for that score.

    Args:
        results : DataFrame
            DataFrame with the results of the grid search.
        axe : Axes
            Axe where we'll plot the dotted vertical line.
        x_values : ndarray
            Array with the values used for the chart's X axis.
        metric : string
            The name of the metric whose best value we want to mark.
        color : string
            The code of the color we want to mark the best value with.

    Returns:
        None
    '''
    # Rank 1 marks the best candidate for this metric.
    ranks = results['rank_test_%s' % metric]
    best_index = np.nonzero(ranks == 1)[0][0]
    best_score = results['mean_test_%s' % metric][best_index]
    best_x = x_values[best_index]
    # Dash-dot vertical line from the axis up to the best score, capped
    # with an 'x' marker, plus a small annotation just above it.
    axe.plot([best_x, best_x], [0, best_score], linestyle='-.', color=color,
             marker='x', markeredgewidth=3, ms=8)
    axe.annotate('%0.2f' % best_score, (best_x, best_score + 0.005))


def plot_estimator_metrics(estimator, metrics, results):
    '''
    Plot one chart comparing the scores obtained by every candidate of the
    estimator's grid search, with a train and a test curve per scoring
    metric plus a marker on each metric's best value.

    Args:
        estimator : string
            The name of the estimator whose results are going to be plotted.
        metrics : list
            List containing the names of the metrics evaluated for the
            estimator during the search. The first metric in the list is
            assumed to be the main metric, which was used to select the best
            estimator.
        results : DataFrame
            DataFrame with the results of the estimator's grid search.

    Returns:
        None
    '''
    # TODO explore with a pandas pivot table that gets the average metric
    # score (for all metrics evaluated) for each value of a parameter, and
    # plot one chart per parameter to detect the parameters with the most
    # impact on this particular problem.
    candidate_count = len(results['mean_test_' + metrics[0]])
    x_values = np.arange(candidate_count)
    plt.figure(figsize=(20, 10))
    plt.title('Results for ' + estimator, fontsize=16)
    plt.xlabel('Candidates')
    plt.ylabel('Score')
    axe = plt.gca()
    axe.set_xlim(0, candidate_count - 1)
    axe.set_ylim(0.0, 1.0)

    for metric, color in zip(sorted(metrics), ['g', 'k', 'b', 'r']):
        for sample, style in (('train', '--'), ('test', '-')):
            is_test = sample == 'test'
            means = results['mean_%s_%s' % (sample, metric)]
            stds = results['std_%s_%s' % (sample, metric)]
            # Shaded band of one standard deviation around the mean score;
            # only the test band is visible (train alpha is 0).
            axe.fill_between(x_values, means - stds, means + stds,
                             alpha=0.1 if is_test else 0, color=color)
            axe.plot(x_values, means, style, color=color,
                     alpha=1 if is_test else 0.7,
                     label='%s (%s)' % (metric, sample))

        add_best_metric_value_marker(results, axe, x_values, metric, color)

    plt.legend(loc='best')
    plt.grid(False)
    plt.show()


def get_best_estimator(features, labels, pipelines, cv_strategy, metrics):
    '''
    Run a grid search for every candidate pipeline definition and return
    the results and fitted estimator of the candidate that scored highest
    on the main metric.

    Args:
        features : ndarray
            Array with the features for each data point in the enron dataset.
        labels : ndarray
            Array with the labels for each data point in the enron dataset.
        pipelines : Dictionary
            Dictionary with specification of the different pipelines we want to
            use to try and solve this particular problem.
        cv_strategy : cross-validation generator
            Method from the model_selection package that defines a cross
            validation strategy to be used for this particular problem.
        metrics : Dictionary
            Dictionary containing the names of the different metrics we want to
            measure for each one of the evaluated estimators. The first metric
            in the Dictionary is assumed to be the main metric to use for
            choosing the best estimator.

    Returns:
        best_results : DataFrame
            DataFrame with the best results of the grid search.
        best_estimator : Object
            This is the best estimator that was found during the search.
    '''
    metric_names = list(metrics.keys())
    print('\nPerforming Model Optimizations...')
    top_main_score = -1.0
    best_estimator = ''
    best_results = ''
    pipeline = get_dummy_pipeline_with_memory()
    for estimator, param_grid in pipelines.items():
        print('\nAnalyzing {}...'.format(estimator))
        # The first metric name is used to refit the winning candidate.
        searcher = GridSearchCV(pipeline, param_grid=param_grid,
                                cv=cv_strategy, scoring=metrics,
                                refit=metric_names[0], iid=False,
                                n_jobs=8, verbose=True, error_score='raise',
                                return_train_score=True)
        searcher.fit(features, labels)
        search_results = searcher.cv_results_
        print('\nBest {} Found:\n{}\n'.format(estimator,
                                              searcher.best_estimator_))
        candidate_metrics = get_best_estimator_metrics(search_results,
                                                       metric_names)
        plot_estimator_metrics(estimator, metric_names, search_results)
        # Keep the candidate whose main metric beats the best seen so far.
        main_score = candidate_metrics[0]
        if main_score > top_main_score:
            best_estimator = searcher.best_estimator_
            best_results = search_results
            top_main_score = main_score

    return best_results, best_estimator


def custom_score(labels, predictions):
    '''
    Score a set of predictions against the true labels with a weighted
    blend of accuracy, precision and recall, aiming for models that keep a
    good accuracy without sacrificing precision and recall.

    Args:
        labels : ndarray
            Array with the labels for each data point in the dataset.
        predictions : ndarray
            Array with the predictions for each data point in the dataset.

    Returns:
        total_score : double
            The score assigned to the model, given the predictions.
    '''
    accuracy = accuracy_score(labels, predictions)
    # Accuracy appears twice in the list so it weighs double in the average.
    partial_scores = [accuracy, accuracy,
                      precision_score(labels, predictions),
                      recall_score(labels, predictions)]

    return average(partial_scores)


def print_overall_results(start_time, results, metrics, best_estimator):
    '''
    Report the total search time, the metric values of the winning
    estimator, and its full definition.

    Args:
        start_time : float
            The time when the search process started.
        results : DataFrame
            DataFrame with the results of the estimator's grid search.
        metrics : list
            List containing the names of the metrics evaluated for the
            estimator during the search.
        best_estimator : string
            The definition of the best estimator found.

    Returns:
        None
    '''
    elapsed_seconds = round(time() - start_time, 3)
    print('\nTotal training time: {} s'.format(elapsed_seconds))
    print('\nBest Overall Results:')
    # Printing the metric values is the side effect we want here; the
    # returned list is not needed.
    get_best_estimator_metrics(results, list(metrics.keys()))
    print('\nBest Overall Estimator Found:\n{}'.format(best_estimator))


# Task 0: Load and explore the dataset and features.
enron_data = load_data('final_project_dataset.pkl')
enron_data_frame = get_clean_enron_dataframe(enron_data)
describe_dataset(enron_data_frame, 'poi')
plot_features(enron_data_frame, 'poi', 3)

# Task 1: Remove outliers detected during the exploration above.
enron_data = remove_enron_outliers(enron_data)

# Task 2: Select what features you'll use.
# Score every available feature and keep only the 8 best ones, then rebuild
# the labels/features arrays from that reduced feature list.
full_enron_feature_list = get_enron_feature_list()
labels, features = get_labels_features(enron_data, full_enron_feature_list)
enron_feature_list = get_best_enron_features(labels, features,
                                             full_enron_feature_list, 8)
labels, features = get_labels_features(enron_data, enron_feature_list)
labels, features = add_enron_features(labels, features)

# TODO Task 3: Create new feature(s)

# Task 4: Try a variety of classifiers
# Please name your classifier clf for easy export below.
# Note that if you want to do PCA or other multi-stage operations,
# you'll need to use Pipelines. For more info:
# http://scikit-learn.org/stable/modules/pipeline.html
pipelines = get_pipelines_definitions()

# Task 5: Tune your classifier to achieve better than .3 precision and recall
# using our testing script. Check the tester.py script in the final project
# folder for details on the evaluation method, especially the test_classifier
# function. Because of the small dataset size, the test script uses stratified
# shuffle split cross validation, so that's what we'll use here as well.
# For more info:
# http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
cv_strategy = StratifiedShuffleSplit(n_splits=100, random_state=42)
# We define all the scoring metrics we want to measure. The first entry (the
# custom 'overall' score) is the one used to select the best set of
# parameters and refit the identifier, because get_best_estimator assumes
# the metric in that position to be the main one. Recall matters here
# because false positives are far better than false negatives: we don't
# want to risk missing any pois.
start_time = time()
# To guarantee dictionary order, we pass an iterable of key-value pairs.
metrics = OrderedDict([
    ('overall', make_scorer(custom_score)),
    ('accuracy', 'accuracy'),
    ('recall', 'recall'),
    ('precision', 'precision'),
])
results, best_estimator = get_best_estimator(features, labels, pipelines,
                                             cv_strategy, metrics)
print_overall_results(start_time, results, metrics, best_estimator)

# TODO fix this. Maybe refit is needed here before getting results?
# results = DataFrame.from_dict(best_estimator.cv_results_)
# results.head()

# Task 6: Dump your classifier, dataset, and features_list so anyone can check
# your results. You do not need to change anything below, but make sure that
# the version of poi_id.py that you submit can be run on its own and generates
# the necessary .pkl files for validating your results.
dump_classifier_and_data(best_estimator, enron_data, enron_feature_list)
DataFrame head:
salary to_messages deferral_payments total_payments exercised_stock_options bonus restricted_stock shared_receipt_with_poi restricted_stock_deferred total_stock_value expenses loan_advances from_messages other from_this_person_to_poi poi director_fees deferred_income long_term_incentive from_poi_to_this_person
ALLEN PHILLIP K 201,955.00 2,902.00 2,869,717.00 4,484,442.00 1,729,541.00 4,175,000.00 126,027.00 1,407.00 -126,027.00 1,729,541.00 13,868.00 nan 2,195.00 152.00 65.00 0 nan -3,081,055.00 304,805.00 47.00
BADUM JAMES P nan nan 178,980.00 182,466.00 257,817.00 nan nan nan nan 257,817.00 3,486.00 nan nan nan nan 0 nan nan nan nan
BANNANTINE JAMES M 477.00 566.00 nan 916,197.00 4,046,157.00 nan 1,757,552.00 465.00 -560,222.00 5,243,487.00 56,301.00 nan 29.00 864,523.00 0.00 0 nan -5,104.00 nan 39.00
BAXTER JOHN C 267,102.00 nan 1,295,738.00 5,634,343.00 6,680,544.00 1,200,000.00 3,942,714.00 nan nan 10,623,258.00 11,200.00 nan nan 2,660,303.00 nan 0 nan -1,386,055.00 1,586,055.00 nan
BAY FRANKLIN R 239,671.00 nan 260,455.00 827,696.00 nan 400,000.00 145,796.00 nan -82,782.00 63,014.00 129,142.00 nan nan 69.00 nan 0 nan -201,641.00 nan nan
BAZELIDES PHILIP J 80,818.00 nan 684,694.00 860,136.00 1,599,641.00 nan nan nan nan 1,599,641.00 nan nan nan 874.00 nan 0 nan nan 93,750.00 nan
BECK SALLY W 231,330.00 7,315.00 nan 969,068.00 nan 700,000.00 126,027.00 2,639.00 nan 126,027.00 37,172.00 nan 4,343.00 566.00 386.00 0 nan nan nan 144.00
BELDEN TIMOTHY N 213,999.00 7,991.00 2,144,013.00 5,501,630.00 953,136.00 5,249,999.00 157,569.00 5,521.00 nan 1,110,705.00 17,355.00 nan 484.00 210,698.00 108.00 1 nan -2,334,434.00 nan 228.00
BELFER ROBERT nan nan -102,500.00 102,500.00 3,285.00 nan nan nan 44,093.00 -44,093.00 nan nan nan nan nan 0 3,285.00 nan nan nan
BERBERIAN DAVID 216,582.00 nan nan 228,474.00 1,624,396.00 nan 869,220.00 nan nan 2,493,616.00 11,892.00 nan nan nan nan 0 nan nan nan nan
BERGSIEKER RICHARD P 187,922.00 383.00 nan 618,850.00 nan 250,000.00 659,249.00 233.00 nan 659,249.00 59,175.00 nan 59.00 427,316.00 0.00 0 nan -485,813.00 180,250.00 4.00
BHATNAGAR SANJAY nan 523.00 nan 15,456,290.00 2,604,490.00 nan -2,604,490.00 463.00 15,456,290.00 nan nan nan 29.00 137,864.00 1.00 0 137,864.00 nan nan 0.00
BIBI PHILIPPE A 213,625.00 1,607.00 nan 2,047,593.00 1,465,734.00 1,000,000.00 378,082.00 1,336.00 nan 1,843,816.00 38,559.00 nan 40.00 425,688.00 8.00 0 nan nan 369,721.00 23.00
BLACHMAN JEREMY M 248,546.00 2,475.00 nan 2,014,835.00 765,313.00 850,000.00 189,041.00 2,326.00 nan 954,354.00 84,208.00 nan 14.00 272.00 2.00 0 nan nan 831,809.00 25.00
BLAKE JR. NORMAN P nan nan nan 1,279.00 nan nan nan nan nan nan 1,279.00 nan nan nan nan 0 113,784.00 -113,784.00 nan nan
BOWEN JR RAYMOND M 278,601.00 1,858.00 nan 2,669,589.00 nan 1,350,000.00 252,055.00 1,593.00 nan 252,055.00 65,907.00 nan 27.00 1,621.00 15.00 1 nan -833.00 974,293.00 140.00
BROWN MICHAEL nan 1,486.00 nan 49,288.00 nan nan nan 761.00 nan nan 49,288.00 nan 41.00 nan 1.00 0 nan nan nan 13.00
BUCHANAN HAROLD G 248,017.00 1,088.00 nan 1,054,637.00 825,464.00 500,000.00 189,041.00 23.00 nan 1,014,505.00 600.00 nan 125.00 1,215.00 0.00 0 nan nan 304,805.00 0.00
BUTTS ROBERT H 261,516.00 nan nan 1,271,582.00 nan 750,000.00 417,619.00 nan nan 417,619.00 9,410.00 nan nan 150,656.00 nan 0 nan -75,000.00 175,000.00 nan
BUY RICHARD B 330,546.00 3,523.00 649,584.00 2,355,702.00 2,542,813.00 900,000.00 901,657.00 2,333.00 nan 3,444,470.00 nan nan 1,053.00 400,572.00 71.00 0 nan -694,862.00 769,862.00 156.00
CALGER CHRISTOPHER F 240,189.00 2,598.00 nan 1,639,297.00 nan 1,250,000.00 126,027.00 2,188.00 nan 126,027.00 35,818.00 nan 144.00 486.00 25.00 1 nan -262,500.00 375,304.00 199.00
CARTER REBECCA C 261,809.00 312.00 nan 477,557.00 nan 300,000.00 307,301.00 196.00 -307,301.00 nan nan nan 15.00 540.00 7.00 0 nan -159,792.00 75,000.00 29.00
CAUSEY RICHARD A 415,189.00 1,892.00 nan 1,868,758.00 nan 1,000,000.00 2,502,063.00 1,585.00 nan 2,502,063.00 30,674.00 nan 49.00 307,895.00 12.00 1 nan -235,000.00 350,000.00 58.00
CHAN RONNIE nan nan nan nan nan nan 32,460.00 nan -32,460.00 nan nan nan nan nan nan 0 98,784.00 -98,784.00 nan nan
CHRISTODOULOU DIOMEDES nan nan nan nan 5,127,155.00 nan 950,730.00 nan nan 6,077,885.00 nan nan nan nan nan 0 nan nan nan nan
CLINE KENNETH W nan nan nan nan nan nan 662,086.00 nan -472,568.00 189,518.00 nan nan nan nan nan 0 nan nan nan nan
COLWELL WESLEY 288,542.00 1,758.00 27,610.00 1,490,344.00 nan 1,200,000.00 698,242.00 1,132.00 nan 698,242.00 16,514.00 nan 40.00 101,740.00 11.00 1 nan -144,062.00 nan 240.00
CORDES WILLIAM R nan 764.00 nan nan 651,850.00 nan 386,335.00 58.00 nan 1,038,185.00 nan nan 12.00 nan 0.00 0 nan nan nan 10.00
COX DAVID 314,288.00 102.00 nan 1,101,393.00 117,551.00 800,000.00 378,082.00 71.00 nan 495,633.00 27,861.00 nan 33.00 494.00 4.00 0 nan -41,250.00 nan 0.00
CUMBERLAND MICHAEL S 184,899.00 nan nan 807,956.00 nan 325,000.00 207,940.00 nan nan 207,940.00 22,344.00 nan nan 713.00 nan 0 nan nan 275,000.00 nan
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
SCRIMSHAW MATTHEW nan nan nan nan 759,557.00 nan nan nan nan 759,557.00 nan nan nan nan nan 0 nan nan nan nan
SHANKMAN JEFFREY A 304,110.00 3,221.00 nan 3,038,702.00 1,441,898.00 2,000,000.00 630,137.00 1,730.00 nan 2,072,035.00 178,979.00 nan 2,681.00 1,191.00 83.00 0 nan nan 554,422.00 94.00
SHAPIRO RICHARD S 269,076.00 15,149.00 nan 1,057,548.00 607,837.00 650,000.00 379,164.00 4,527.00 nan 987,001.00 137,767.00 nan 1,215.00 705.00 65.00 0 nan nan nan 74.00
SHARP VICTORIA T 248,146.00 3,136.00 187,469.00 1,576,511.00 281,073.00 600,000.00 213,063.00 2,477.00 nan 494,136.00 116,337.00 nan 136.00 2,401.00 6.00 0 nan nan 422,158.00 24.00
SHELBY REX 211,844.00 225.00 nan 2,003,885.00 1,624,396.00 200,000.00 869,220.00 91.00 nan 2,493,616.00 22,884.00 nan 39.00 1,573,324.00 14.00 1 nan -4,167.00 nan 13.00
SHERRICK JEFFREY B nan 613.00 nan nan 1,426,469.00 nan 405,999.00 583.00 nan 1,832,468.00 nan nan 25.00 nan 18.00 0 nan nan nan 39.00
SHERRIFF JOHN R 428,780.00 3,187.00 nan 4,335,388.00 1,835,558.00 1,500,000.00 1,293,424.00 2,103.00 nan 3,128,982.00 nan nan 92.00 1,852,186.00 23.00 0 nan nan 554,422.00 28.00
SKILLING JEFFREY K 1,111,258.00 3,627.00 nan 8,682,716.00 19,250,000.00 5,600,000.00 6,843,672.00 2,042.00 nan 26,093,672.00 29,336.00 nan 108.00 22,122.00 30.00 1 nan nan 1,920,000.00 88.00
STABLER FRANK 239,502.00 nan nan 1,112,087.00 nan 500,000.00 511,734.00 nan nan 511,734.00 16,514.00 nan nan 356,071.00 nan 0 nan nan nan nan
SULLIVAN-SHAKLOVITZ COLLEEN 162,779.00 nan 181,993.00 999,356.00 1,362,375.00 100,000.00 nan nan nan 1,362,375.00 nan nan nan 162.00 nan 0 nan nan 554,422.00 nan
SUNDE MARTIN 257,486.00 2,647.00 nan 1,545,059.00 nan 700,000.00 698,920.00 2,565.00 nan 698,920.00 nan nan 38.00 111,122.00 13.00 0 nan nan 476,451.00 37.00
TAYLOR MITCHELL S 265,214.00 533.00 227,449.00 1,092,663.00 3,181,250.00 600,000.00 563,798.00 300.00 nan 3,745,048.00 nan nan 29.00 nan 0.00 0 nan nan nan 0.00
THE TRAVEL AGENCY IN THE PARK nan nan nan 362,096.00 nan nan nan nan nan nan nan nan nan 362,096.00 nan 0 nan nan nan nan
THORN TERENCE H 222,093.00 266.00 16,586.00 911,453.00 4,452,476.00 nan 365,320.00 73.00 nan 4,817,796.00 46,145.00 nan 41.00 426,629.00 0.00 0 nan nan 200,000.00 0.00
TILNEY ELIZABETH A 247,338.00 460.00 nan 399,393.00 591,250.00 300,000.00 576,792.00 379.00 nan 1,168,042.00 nan nan 19.00 152,055.00 11.00 0 nan -575,000.00 275,000.00 10.00
UMANOFF ADAM S 288,589.00 111.00 nan 1,130,461.00 nan 788,750.00 nan 41.00 nan nan 53,122.00 nan 18.00 nan 0.00 0 nan nan nan 12.00
URQUHART JOHN A nan nan nan 228,656.00 nan nan nan nan nan nan 228,656.00 nan nan nan nan 0 36,666.00 -36,666.00 nan nan
WAKEHAM JOHN nan nan nan 213,071.00 nan nan nan nan nan nan 103,773.00 nan nan nan nan 0 109,298.00 nan nan nan
WALLS JR ROBERT H 357,091.00 671.00 nan 1,798,780.00 4,346,544.00 850,000.00 1,552,453.00 215.00 nan 5,898,997.00 50,936.00 nan 146.00 2.00 0.00 0 nan nan 540,751.00 17.00
WALTERS GARETH W nan nan 53,625.00 87,410.00 1,030,329.00 nan nan nan nan 1,030,329.00 33,785.00 nan nan nan nan 0 nan nan nan nan
WASAFF GEORGE 259,996.00 400.00 831,299.00 1,034,395.00 1,668,260.00 325,000.00 388,167.00 337.00 nan 2,056,427.00 nan nan 30.00 1,425.00 7.00 0 nan -583,325.00 200,000.00 22.00
WESTFAHL RICHARD K 63,744.00 nan nan 762,135.00 nan nan 384,930.00 nan nan 384,930.00 51,870.00 nan nan 401,130.00 nan 0 nan -10,800.00 256,191.00 nan
WHALEY DAVID A nan nan nan nan 98,718.00 nan nan nan nan 98,718.00 nan nan nan nan nan 0 nan nan nan nan
WHALLEY LAWRENCE G 510,364.00 6,019.00 nan 4,677,574.00 3,282,960.00 3,000,000.00 2,796,177.00 3,920.00 nan 6,079,137.00 57,838.00 nan 556.00 301,026.00 24.00 0 nan nan 808,346.00 186.00
WHITE JR THOMAS E 317,543.00 nan nan 1,934,359.00 1,297,049.00 450,000.00 13,847,074.00 nan nan 15,144,123.00 81,353.00 nan nan 1,085,463.00 nan 0 nan nan nan nan
WINOKUR JR. HERBERT S nan nan nan 84,992.00 nan nan nan nan nan nan 1,413.00 nan nan nan nan 0 108,579.00 -25,000.00 nan nan
WODRASKA JOHN nan nan nan 189,583.00 nan nan nan nan nan nan nan nan nan 189,583.00 nan 0 nan nan nan nan
WROBEL BRUCE nan nan nan nan 139,130.00 nan nan nan nan 139,130.00 nan nan nan nan nan 0 nan nan nan nan
YEAGER F SCOTT 158,403.00 nan nan 360,300.00 8,308,552.00 nan 3,576,206.00 nan nan 11,884,758.00 53,947.00 nan nan 147,950.00 nan 1 nan nan nan nan
YEAP SOON nan nan nan 55,097.00 192,758.00 nan nan nan nan 192,758.00 55,097.00 nan nan nan nan 0 nan nan nan nan

145 rows × 20 columns

Enron data point count: 145

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
Index: 145 entries, ALLEN PHILLIP K to YEAP SOON
Data columns (total 20 columns):
salary                       94 non-null float64
to_messages                  86 non-null float64
deferral_payments            38 non-null float64
total_payments               124 non-null float64
exercised_stock_options      101 non-null float64
bonus                        81 non-null float64
restricted_stock             109 non-null float64
shared_receipt_with_poi      86 non-null float64
restricted_stock_deferred    17 non-null float64
total_stock_value            125 non-null float64
expenses                     94 non-null float64
loan_advances                3 non-null float64
from_messages                86 non-null float64
other                        92 non-null float64
from_this_person_to_poi      86 non-null float64
poi                          145 non-null int64
director_fees                16 non-null float64
deferred_income              48 non-null float64
long_term_incentive          65 non-null float64
from_poi_to_this_person      86 non-null float64
dtypes: float64(19), int64(1)
memory usage: 23.8+ KB

DataFrame description:
salary to_messages deferral_payments total_payments exercised_stock_options bonus restricted_stock shared_receipt_with_poi restricted_stock_deferred total_stock_value expenses loan_advances from_messages other from_this_person_to_poi poi director_fees deferred_income long_term_incentive from_poi_to_this_person
count 94.00 86.00 38.00 124.00 101.00 81.00 109.00 86.00 17.00 125.00 94.00 3.00 86.00 92.00 86.00 145.00 16.00 48.00 65.00 86.00
mean 284,087.54 2,073.86 841,602.53 2,623,421.18 2,959,559.26 1,201,773.07 1,147,424.09 1,176.47 621,892.82 3,352,073.02 54,192.01 27,975,000.00 608.79 465,276.66 41.23 0.12 89,822.88 -581,049.81 746,491.20 64.90
std 177,131.12 2,582.70 1,289,322.63 9,488,105.53 5,499,449.60 1,441,679.44 2,249,770.36 1,178.32 3,845,528.35 6,532,883.10 46,108.38 46,382,560.03 1,841.03 1,389,719.06 100.07 0.33 41,112.70 942,076.40 862,917.42 86.98
min 477.00 57.00 -102,500.00 148.00 3,285.00 70,000.00 -2,604,490.00 2.00 -1,787,380.00 -44,093.00 148.00 400,000.00 12.00 2.00 0.00 0.00 3,285.00 -3,504,386.00 69,223.00 0.00
25% 211,802.00 541.25 79,644.50 386,380.25 506,765.00 425,000.00 252,055.00 249.75 -329,825.00 494,136.00 22,479.00 1,200,000.00 22.75 1,209.00 1.00 0.00 83,674.50 -611,209.25 275,000.00 10.00
50% 258,741.00 1,211.00 221,063.50 1,100,246.50 1,297,049.00 750,000.00 441,096.00 740.50 -140,264.00 1,095,040.00 46,547.50 2,000,000.00 41.00 51,984.50 8.00 0.00 106,164.50 -151,927.00 422,158.00 35.00
75% 308,606.50 2,634.75 867,211.25 2,084,662.75 2,542,813.00 1,200,000.00 985,032.00 1,888.25 -72,419.00 2,606,763.00 78,408.50 41,762,500.00 145.50 357,577.25 24.75 0.00 112,815.00 -37,926.00 831,809.00 72.25
max 1,111,258.00 15,149.00 6,426,990.00 103,559,793.00 34,348,384.00 8,000,000.00 14,761,694.00 5,521.00 15,456,290.00 49,110,078.00 228,763.00 81,525,000.00 14,368.00 10,359,729.00 609.00 1.00 137,864.00 -833.00 5,145,434.00 528.00
Columns in dataframe: 20.
Columns missing values: 19.

Missing values table:
Missing Values % of Total Values
loan_advances 142 97.90
director_fees 129 89.00
restricted_stock_deferred 128 88.30
deferral_payments 107 73.80
deferred_income 97 66.90
long_term_incentive 80 55.20
bonus 64 44.10
from_poi_to_this_person 59 40.70
shared_receipt_with_poi 59 40.70
to_messages 59 40.70
from_this_person_to_poi 59 40.70
from_messages 59 40.70
other 53 36.60
salary 51 35.20
expenses 51 35.20
exercised_stock_options 44 30.30
restricted_stock 36 24.80
total_payments 21 14.50
total_stock_value 20 13.80
Correlation Heatmap:
Label value counts:
0    127
1     18
Name: poi, dtype: int64

Feature correlations to (poi) feature:
Correlation Absolute Correlation
loan_advances 1.00 1.00
exercised_stock_options 0.50 0.50
total_stock_value 0.37 0.37
bonus 0.30 0.30
deferred_income -0.27 0.27
salary 0.26 0.26
long_term_incentive 0.25 0.25
total_payments 0.23 0.23
shared_receipt_with_poi 0.23 0.23
restricted_stock 0.22 0.22
from_poi_to_this_person 0.17 0.17
other 0.12 0.12
from_this_person_to_poi 0.11 0.11
deferral_payments -0.10 0.10
from_messages -0.07 0.07
expenses 0.06 0.06
to_messages 0.06 0.06
Feature Boxplots:
Feature Swarmplots:
Outlier features:
['deferral_payments', 'restricted_stock', 'total_stock_value']
Total outliers removed:
3

Current features and labels shapes:
Enron labels shape: (145L,)
Enron features shape: (145L, 19L)

Selected features (with label):
['poi', 'total_payments', 'deferred_income', 'restricted_stock', 'expenses', 'total_stock_value', 'other', 'bonus', 'exercised_stock_options']

Feature Importances:
Current features and labels shapes:
Enron labels shape: (145L,)
Enron features shape: (145L, 8L)

Performing Model Optimizations...

Analyzing KMeans...
Fitting 100 folds for each of 16 candidates, totalling 1600 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  60 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 360 tasks      | elapsed:    6.7s
[Parallel(n_jobs=8)]: Done 860 tasks      | elapsed:   17.7s
[Parallel(n_jobs=8)]: Done 1560 tasks      | elapsed:   32.2s
[Parallel(n_jobs=8)]: Done 1600 out of 1600 | elapsed:   33.0s finished

Best KMeans Found:
Pipeline(memory='c:\\users\\carlos\\appdata\\local\\temp\\poi_id_czdcz_',
     steps=[('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classify', KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0))])

Best Estimator Overall: 0.4475
Best Estimator Accuracy: 0.8593
Best Estimator Recall: 0.0450
Best Estimator Precision: 0.0264
Analyzing RandomForestClassifier...
Fitting 100 folds for each of 2592 candidates, totalling 259200 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 352 tasks      | elapsed:    5.6s
[Parallel(n_jobs=8)]: Done 852 tasks      | elapsed:   14.7s
[Parallel(n_jobs=8)]: Done 1552 tasks      | elapsed:   28.0s
[Parallel(n_jobs=8)]: Done 2452 tasks      | elapsed:   47.1s
[Parallel(n_jobs=8)]: Done 3552 tasks      | elapsed:  1.2min
[Parallel(n_jobs=8)]: Done 4852 tasks      | elapsed:  1.8min
[Parallel(n_jobs=8)]: Done 6352 tasks      | elapsed:  2.2min
[Parallel(n_jobs=8)]: Done 8052 tasks      | elapsed:  2.8min
[Parallel(n_jobs=8)]: Done 9952 tasks      | elapsed:  3.7min
[Parallel(n_jobs=8)]: Done 12052 tasks      | elapsed:  4.3min
[Parallel(n_jobs=8)]: Done 14352 tasks      | elapsed:  5.4min
[Parallel(n_jobs=8)]: Done 16852 tasks      | elapsed:  6.1min
[Parallel(n_jobs=8)]: Done 19552 tasks      | elapsed:  7.3min
[Parallel(n_jobs=8)]: Done 22452 tasks      | elapsed:  8.2min
[Parallel(n_jobs=8)]: Done 25552 tasks      | elapsed:  9.4min
[Parallel(n_jobs=8)]: Done 28852 tasks      | elapsed: 10.7min
[Parallel(n_jobs=8)]: Done 32352 tasks      | elapsed: 11.8min
[Parallel(n_jobs=8)]: Done 36052 tasks      | elapsed: 13.2min
[Parallel(n_jobs=8)]: Done 39952 tasks      | elapsed: 14.7min
[Parallel(n_jobs=8)]: Done 44052 tasks      | elapsed: 16.2min
[Parallel(n_jobs=8)]: Done 48352 tasks      | elapsed: 17.9min
[Parallel(n_jobs=8)]: Done 52852 tasks      | elapsed: 19.6min
[Parallel(n_jobs=8)]: Done 57552 tasks      | elapsed: 21.4min
[Parallel(n_jobs=8)]: Done 62452 tasks      | elapsed: 23.2min
[Parallel(n_jobs=8)]: Done 67552 tasks      | elapsed: 25.1min
[Parallel(n_jobs=8)]: Done 72852 tasks      | elapsed: 27.1min
[Parallel(n_jobs=8)]: Done 78352 tasks      | elapsed: 29.1min
[Parallel(n_jobs=8)]: Done 84052 tasks      | elapsed: 31.1min
[Parallel(n_jobs=8)]: Done 89952 tasks      | elapsed: 33.2min
[Parallel(n_jobs=8)]: Done 96052 tasks      | elapsed: 35.6min
[Parallel(n_jobs=8)]: Done 102352 tasks      | elapsed: 38.1min
[Parallel(n_jobs=8)]: Done 108852 tasks      | elapsed: 40.6min
[Parallel(n_jobs=8)]: Done 115552 tasks      | elapsed: 43.4min
[Parallel(n_jobs=8)]: Done 122452 tasks      | elapsed: 45.8min
[Parallel(n_jobs=8)]: Done 129552 tasks      | elapsed: 48.6min
[Parallel(n_jobs=8)]: Done 136852 tasks      | elapsed: 51.1min
[Parallel(n_jobs=8)]: Done 144352 tasks      | elapsed: 54.0min
[Parallel(n_jobs=8)]: Done 152052 tasks      | elapsed: 56.6min
[Parallel(n_jobs=8)]: Done 159952 tasks      | elapsed: 59.5min
[Parallel(n_jobs=8)]: Done 168052 tasks      | elapsed: 62.5min
[Parallel(n_jobs=8)]: Done 176352 tasks      | elapsed: 65.5min
[Parallel(n_jobs=8)]: Done 184852 tasks      | elapsed: 68.6min
[Parallel(n_jobs=8)]: Done 193552 tasks      | elapsed: 71.7min
[Parallel(n_jobs=8)]: Done 202452 tasks      | elapsed: 75.0min
[Parallel(n_jobs=8)]: Done 211552 tasks      | elapsed: 78.4min
[Parallel(n_jobs=8)]: Done 220852 tasks      | elapsed: 81.9min
[Parallel(n_jobs=8)]: Done 230352 tasks      | elapsed: 85.3min
[Parallel(n_jobs=8)]: Done 240052 tasks      | elapsed: 88.9min
[Parallel(n_jobs=8)]: Done 249952 tasks      | elapsed: 92.5min
[Parallel(n_jobs=8)]: Done 259200 out of 259200 | elapsed: 95.9min finished

Best RandomForestClassifier Found:
Pipeline(memory='c:\\users\\carlos\\appdata\\local\\temp\\poi_id_czdcz_',
     steps=[('scale', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classify', RandomForestClassifier(bootstrap=T...mators=16, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

Best Estimator Overall: 0.5662
Best Estimator Accuracy: 0.8707
Best Estimator Recall: 0.2050
Best Estimator Precision: 0.3183
Analyzing GaussianNB...
Fitting 100 folds for each of 16 candidates, totalling 1600 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done 104 tasks      | elapsed:    1.5s
[Parallel(n_jobs=8)]: Done 704 tasks      | elapsed:    8.4s
[Parallel(n_jobs=8)]: Done 1600 out of 1600 | elapsed:   18.5s finished

Best GaussianNB Found:
Pipeline(memory='c:\\users\\carlos\\appdata\\local\\temp\\poi_id_czdcz_',
     steps=[('scale', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classify', GaussianNB(priors=None, var_smoothing=1e-09))])

Best Estimator Overall: 0.6391
Best Estimator Accuracy: 0.8667
Best Estimator Recall: 0.3700
Best Estimator Precision: 0.4532
Analyzing AdaBoostClassifier...
Fitting 100 folds for each of 576 candidates, totalling 57600 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    8.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   19.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:   35.2s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:   55.6s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:  1.4min
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:  2.3min
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:  3.3min
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:  5.3min
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:  7.4min
[Parallel(n_jobs=8)]: Done 6034 tasks      | elapsed:  8.1min
[Parallel(n_jobs=8)]: Done 7184 tasks      | elapsed:  9.4min
[Parallel(n_jobs=8)]: Done 8434 tasks      | elapsed: 11.7min
[Parallel(n_jobs=8)]: Done 9784 tasks      | elapsed: 15.2min
[Parallel(n_jobs=8)]: Done 11234 tasks      | elapsed: 16.5min
[Parallel(n_jobs=8)]: Done 12784 tasks      | elapsed: 18.8min
[Parallel(n_jobs=8)]: Done 14434 tasks      | elapsed: 23.2min
[Parallel(n_jobs=8)]: Done 16184 tasks      | elapsed: 24.5min
[Parallel(n_jobs=8)]: Done 18034 tasks      | elapsed: 27.5min
[Parallel(n_jobs=8)]: Done 19984 tasks      | elapsed: 31.3min
[Parallel(n_jobs=8)]: Done 22034 tasks      | elapsed: 34.6min
[Parallel(n_jobs=8)]: Done 24184 tasks      | elapsed: 41.7min
[Parallel(n_jobs=8)]: Done 26434 tasks      | elapsed: 45.0min
[Parallel(n_jobs=8)]: Done 28784 tasks      | elapsed: 52.7min
[Parallel(n_jobs=8)]: Done 31234 tasks      | elapsed: 56.5min
[Parallel(n_jobs=8)]: Done 33784 tasks      | elapsed: 64.7min
[Parallel(n_jobs=8)]: Done 36434 tasks      | elapsed: 68.7min
[Parallel(n_jobs=8)]: Done 39184 tasks      | elapsed: 76.1min
[Parallel(n_jobs=8)]: Done 42034 tasks      | elapsed: 77.3min
[Parallel(n_jobs=8)]: Done 44984 tasks      | elapsed: 78.6min
[Parallel(n_jobs=8)]: Done 48034 tasks      | elapsed: 79.9min
[Parallel(n_jobs=8)]: Done 51184 tasks      | elapsed: 81.2min
[Parallel(n_jobs=8)]: Done 54434 tasks      | elapsed: 82.6min
[Parallel(n_jobs=8)]: Done 57600 out of 57600 | elapsed: 84.0min finished

Best AdaBoostClassifier Found:
Pipeline(memory='c:\\users\\carlos\\appdata\\local\\temp\\poi_id_czdcz_',
     steps=[('scale', None), ('reduce_dim', None), ('classify', AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=1,
          n_estimators=32, random_state=42))])

Best Estimator Overall: 0.5520
Best Estimator Accuracy: 0.8507
Best Estimator Recall: 0.2200
Best Estimator Precision: 0.2867
Analyzing SVC...
Fitting 100 folds for each of 240 candidates, totalling 24000 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:    1.8s
[Parallel(n_jobs=8)]: Done 712 tasks      | elapsed:    8.5s
[Parallel(n_jobs=8)]: Done 1712 tasks      | elapsed:   20.3s
[Parallel(n_jobs=8)]: Done 3112 tasks      | elapsed:   36.7s
[Parallel(n_jobs=8)]: Done 4912 tasks      | elapsed:   57.4s
[Parallel(n_jobs=8)]: Done 7112 tasks      | elapsed:  1.4min
[Parallel(n_jobs=8)]: Done 9712 tasks      | elapsed:  1.9min
[Parallel(n_jobs=8)]: Done 12712 tasks      | elapsed:  2.5min
[Parallel(n_jobs=8)]: Done 16112 tasks      | elapsed:  3.1min
[Parallel(n_jobs=8)]: Done 19912 tasks      | elapsed:  3.9min
[Parallel(n_jobs=8)]: Done 23529 tasks      | elapsed:  5.2min
[Parallel(n_jobs=8)]: Done 24000 out of 24000 | elapsed: 11.1min finished

Best SVC Found:
Pipeline(memory='c:\\users\\carlos\\appdata\\local\\temp\\poi_id_czdcz_',
     steps=[('scale', None), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=4, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classify', SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False))])

Best Estimator Overall: 0.5258
Best Estimator Accuracy: 0.8740
Best Estimator Recall: 0.1300
Best Estimator Precision: 0.2250
Analyzing KNeighborsClassifier...
Fitting 100 folds for each of 960 candidates, totalling 96000 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 352 tasks      | elapsed:    5.9s
[Parallel(n_jobs=8)]: Done 852 tasks      | elapsed:   15.2s
[Parallel(n_jobs=8)]: Done 1552 tasks      | elapsed:   28.1s
[Parallel(n_jobs=8)]: Done 2452 tasks      | elapsed:   41.0s
[Parallel(n_jobs=8)]: Done 3552 tasks      | elapsed:   57.3s
[Parallel(n_jobs=8)]: Done 4852 tasks      | elapsed:  1.4min
[Parallel(n_jobs=8)]: Done 6352 tasks      | elapsed:  1.7min
[Parallel(n_jobs=8)]: Done 8052 tasks      | elapsed:  2.2min
[Parallel(n_jobs=8)]: Done 9952 tasks      | elapsed:  2.7min
[Parallel(n_jobs=8)]: Done 12052 tasks      | elapsed:  3.3min
[Parallel(n_jobs=8)]: Done 14352 tasks      | elapsed:  3.9min
[Parallel(n_jobs=8)]: Done 16852 tasks      | elapsed:  4.6min
[Parallel(n_jobs=8)]: Done 19552 tasks      | elapsed:  5.3min
[Parallel(n_jobs=8)]: Done 22452 tasks      | elapsed:  6.1min
[Parallel(n_jobs=8)]: Done 25552 tasks      | elapsed:  7.0min
[Parallel(n_jobs=8)]: Done 28852 tasks      | elapsed:  7.9min
[Parallel(n_jobs=8)]: Done 32352 tasks      | elapsed:  8.9min
[Parallel(n_jobs=8)]: Done 36052 tasks      | elapsed:  9.9min
[Parallel(n_jobs=8)]: Done 39952 tasks      | elapsed: 11.0min
[Parallel(n_jobs=8)]: Done 44052 tasks      | elapsed: 12.1min
[Parallel(n_jobs=8)]: Done 48352 tasks      | elapsed: 13.3min
[Parallel(n_jobs=8)]: Done 52852 tasks      | elapsed: 14.5min
[Parallel(n_jobs=8)]: Done 57552 tasks      | elapsed: 15.8min
[Parallel(n_jobs=8)]: Done 62452 tasks      | elapsed: 17.2min
[Parallel(n_jobs=8)]: Done 67552 tasks      | elapsed: 18.5min
[Parallel(n_jobs=8)]: Done 72852 tasks      | elapsed: 20.0min
[Parallel(n_jobs=8)]: Done 78352 tasks      | elapsed: 21.5min
[Parallel(n_jobs=8)]: Done 84052 tasks      | elapsed: 23.0min
[Parallel(n_jobs=8)]: Done 89952 tasks      | elapsed: 24.6min
[Parallel(n_jobs=8)]: Done 96000 out of 96000 | elapsed: 26.3min finished

Best KNeighborsClassifier Found:
Pipeline(memory='c:\\users\\carlos\\appdata\\local\\temp\\poi_id_czdcz_',
     steps=[('scale', None), ('reduce_dim', None), ('classify', KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=4, p=1,
           weights='distance'))])

Best Estimator Overall: 0.5574
Best Estimator Accuracy: 0.8607
Best Estimator Recall: 0.2200
Best Estimator Precision: 0.2883
Analyzing DecisionTreeClassifier...
Fitting 100 folds for each of 384 candidates, totalling 38400 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:    1.5s
[Parallel(n_jobs=8)]: Done 712 tasks      | elapsed:    8.5s
[Parallel(n_jobs=8)]: Done 1712 tasks      | elapsed:   20.3s
[Parallel(n_jobs=8)]: Done 3112 tasks      | elapsed:   36.6s
[Parallel(n_jobs=8)]: Done 4912 tasks      | elapsed:   59.1s
[Parallel(n_jobs=8)]: Done 7112 tasks      | elapsed:  1.4min
[Parallel(n_jobs=8)]: Done 9712 tasks      | elapsed:  1.9min
[Parallel(n_jobs=8)]: Done 12712 tasks      | elapsed:  2.5min
[Parallel(n_jobs=8)]: Done 16112 tasks      | elapsed:  3.1min
[Parallel(n_jobs=8)]: Done 19912 tasks      | elapsed:  3.9min
[Parallel(n_jobs=8)]: Done 24112 tasks      | elapsed:  4.7min
[Parallel(n_jobs=8)]: Done 28712 tasks      | elapsed:  5.6min
[Parallel(n_jobs=8)]: Done 33712 tasks      | elapsed:  6.6min
[Parallel(n_jobs=8)]: Done 38400 out of 38400 | elapsed:  7.5min finished

Best DecisionTreeClassifier Found:
Pipeline(memory='c:\\users\\carlos\\appdata\\local\\temp\\poi_id_czdcz_',
     steps=[('scale', None), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=4, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classify', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
 ...        min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'))])

Best Estimator Overall: 0.6168
Best Estimator Accuracy: 0.8540
Best Estimator Recall: 0.3450
Best Estimator Precision: 0.4142
Total training time: 13556.569 s

Best Overall Results:
Best Estimator Overall: 0.6391
Best Estimator Accuracy: 0.8667
Best Estimator Recall: 0.3700
Best Estimator Precision: 0.4532

Best Overall Estimator Found:
Pipeline(memory='c:\\users\\carlos\\appdata\\local\\temp\\poi_id_czdcz_',
     steps=[('scale', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classify', GaussianNB(priors=None, var_smoothing=1e-09))])